smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +7 -1
  5. smftools/cli/hmm_adata.py +902 -244
  6. smftools/cli/load_adata.py +318 -198
  7. smftools/cli/preprocess_adata.py +285 -171
  8. smftools/cli/spatial_adata.py +137 -53
  9. smftools/cli_entry.py +94 -178
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +5 -1
  12. smftools/config/deaminase.yaml +1 -1
  13. smftools/config/default.yaml +22 -17
  14. smftools/config/direct.yaml +8 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +505 -276
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2125 -1426
  21. smftools/hmm/__init__.py +2 -3
  22. smftools/hmm/archived/call_hmm_peaks.py +16 -1
  23. smftools/hmm/call_hmm_peaks.py +173 -193
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +379 -156
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +195 -29
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +347 -168
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +145 -85
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +8 -8
  84. smftools/preprocessing/append_base_context.py +105 -79
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  86. smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +127 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +44 -22
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +103 -55
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +688 -271
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +93 -27
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +264 -109
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.4.dist-info/RECORD +0 -176
  128. /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
  129. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  130. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  131. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  132. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  133. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,19 @@
1
1
  import shutil
2
2
  from pathlib import Path
3
- from typing import Union, Iterable
3
+ from typing import Iterable, Union
4
+
5
+ from smftools.logging_utils import get_logger
4
6
 
5
7
  from .helpers import AdataPaths
6
8
 
9
+ logger = get_logger(__name__)
10
+
11
+
7
12
  def check_executable_exists(cmd: str) -> bool:
8
13
  """Return True if a command-line executable is available in PATH."""
9
14
  return shutil.which(cmd) is not None
10
15
 
16
+
11
17
  def delete_tsvs(
12
18
  tsv_dir: Union[str, Path, Iterable[str], None],
13
19
  *,
@@ -27,48 +33,50 @@ def delete_tsvs(
27
33
  verbose : bool
28
34
  Print progress / warnings.
29
35
  """
36
+
30
37
  # Helper: remove a single file path (Path-like or string)
31
38
  def _maybe_unlink(p: Path):
32
39
  if not p.exists():
33
40
  if verbose:
34
- print(f"[skip] not found: {p}")
41
+ logger.info(f"[skip] not found: {p}")
35
42
  return
36
43
  if not p.is_file():
37
44
  if verbose:
38
- print(f"[skip] not a file: {p}")
45
+ logger.info(f"[skip] not a file: {p}")
39
46
  return
40
47
  if dry_run:
41
- print(f"[dry-run] would remove file: {p}")
48
+ logger.info(f"[dry-run] would remove file: {p}")
42
49
  return
43
50
  try:
44
51
  p.unlink()
45
52
  if verbose:
46
- print(f"Removed file: {p}")
53
+ logger.info(f"Removed file: {p}")
47
54
  except Exception as e:
48
- print(f"[error] failed to remove file {p}: {e}")
55
+ logger.warning(f"Failed to remove file {p}: {e}")
49
56
 
50
57
  # Remove tmp_dir recursively (if provided)
51
58
  if tsv_dir is not None:
52
59
  td = Path(tsv_dir)
53
60
  if not td.exists():
54
61
  if verbose:
55
- print(f"[skip] tsv_dir not found: {td}")
62
+ logger.info(f"[skip] tsv_dir not found: {td}")
56
63
  else:
57
64
  if not td.is_dir():
58
65
  if verbose:
59
- print(f"[skip] tsv_dir is not a directory: {td}")
66
+ logger.info(f"[skip] tsv_dir is not a directory: {td}")
60
67
  else:
61
68
  if dry_run:
62
- print(f"[dry-run] would remove directory tree: {td}")
69
+ logger.info(f"[dry-run] would remove directory tree: {td}")
63
70
  else:
64
71
  try:
65
72
  shutil.rmtree(td)
66
73
  if verbose:
67
- print(f"Removed directory tree: {td}")
74
+ logger.info(f"Removed directory tree: {td}")
68
75
  except Exception as e:
69
- print(f"[error] failed to remove tmp dir {td}: {e}")
76
+ logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
77
+
70
78
 
71
- def load_adata_core(cfg, paths: AdataPaths):
79
+ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
72
80
  """
73
81
  Core load pipeline.
74
82
 
@@ -97,28 +105,34 @@ def load_adata_core(cfg, paths: AdataPaths):
97
105
  cfg : ExperimentConfig
98
106
  (Same object, possibly with some fields updated, e.g. fasta path.)
99
107
  """
100
- import os
101
108
  from pathlib import Path
102
109
 
103
110
  import numpy as np
104
- import pandas as pd
105
- import anndata as ad
106
- import scanpy as sc
107
111
 
108
- from .helpers import write_gz_h5ad
109
-
110
- from ..readwrite import make_dirs, add_or_update_column_in_csv
111
-
112
- from ..informatics.bam_functions import concatenate_fastqs_to_bam, align_and_sort_BAM, demux_and_index_BAM, split_and_index_BAM, bam_qc, extract_read_features_from_bam
112
+ from ..informatics.bam_functions import (
113
+ align_and_sort_BAM,
114
+ bam_qc,
115
+ concatenate_fastqs_to_bam,
116
+ demux_and_index_BAM,
117
+ extract_read_features_from_bam,
118
+ split_and_index_BAM,
119
+ )
120
+ from ..informatics.basecalling import canoncall, modcall
113
121
  from ..informatics.bed_functions import aligned_BAM_to_bed
114
- from ..informatics.pod5_functions import fast5_to_pod5
115
- from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
116
- from ..informatics.basecalling import modcall, canoncall
117
- from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
118
- from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
119
122
  from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
123
+ from ..informatics.fasta_functions import (
124
+ generate_converted_FASTA,
125
+ get_chromosome_lengths,
126
+ subsample_fasta_from_bed,
127
+ )
120
128
  from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
129
+ from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
130
+ from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
131
+ from ..informatics.pod5_functions import fast5_to_pod5
121
132
  from ..informatics.run_multiqc import run_multiqc
133
+ from ..metadata import record_smftools_metadata
134
+ from ..readwrite import add_or_update_column_in_csv, make_dirs
135
+ from .helpers import write_gz_h5ad
122
136
 
123
137
  ################################### 1) General params and input organization ###################################
124
138
  output_directory = Path(cfg.output_directory)
@@ -169,19 +183,20 @@ def load_adata_core(cfg, paths: AdataPaths):
169
183
  if cfg.aligner == "minimap2":
170
184
  if not check_executable_exists("minimap2"):
171
185
  raise RuntimeError(
172
- "Error: 'minimap2' is not installed or not in PATH. "
173
- "Install minimap2"
186
+ "Error: 'minimap2' is not installed or not in PATH. Install minimap2"
174
187
  )
175
188
 
176
189
  # # Detect the input filetypes
177
190
  # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
178
191
  if cfg.input_type == "fast5":
179
192
  # take the input directory of fast5 files and write out a single pod5 file into the output directory.
180
- output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
193
+ output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
181
194
  if output_pod5.exists():
182
195
  pass
183
196
  else:
184
- print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
197
+ logger.info(
198
+ f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
199
+ )
185
200
  fast5_to_pod5(cfg.input_data_path, output_pod5)
186
201
  # Reassign the pod5_dir variable to point to the new pod5 file.
187
202
  cfg.input_data_path = output_pod5
@@ -189,22 +204,24 @@ def load_adata_core(cfg, paths: AdataPaths):
189
204
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
190
205
  elif cfg.input_type == "fastq":
191
206
  # Output file for FASTQ concatenation.
192
- output_bam = cfg.output_directory / 'canonical_basecalls.bam'
207
+ output_bam = cfg.output_directory / "canonical_basecalls.bam"
193
208
  if output_bam.exists():
194
- pass
209
+ logger.debug("Output BAM already exists")
195
210
  else:
211
+ logger.info("Concatenating FASTQ files into a single BAM file")
196
212
  summary = concatenate_fastqs_to_bam(
197
213
  cfg.input_files,
198
214
  output_bam,
199
- barcode_tag='BC',
200
- gzip_suffixes=('.gz','.gzip'),
215
+ barcode_tag="BC",
216
+ gzip_suffixes=(".gz", ".gzip"),
201
217
  barcode_map=cfg.fastq_barcode_map,
202
218
  add_read_group=True,
203
219
  rg_sample_field=None,
204
220
  progress=False,
205
- auto_pair=cfg.fastq_auto_pairing)
206
-
207
- print(f"Found the following barcodes: {summary['barcodes']}")
221
+ auto_pair=cfg.fastq_auto_pairing,
222
+ )
223
+
224
+ logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
208
225
 
209
226
  # Set the input data path to the concatenated BAM.
210
227
  cfg.input_data_path = output_bam
@@ -213,24 +230,24 @@ def load_adata_core(cfg, paths: AdataPaths):
213
230
  pass
214
231
  else:
215
232
  pass
216
-
233
+
217
234
  add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
218
235
 
219
236
  # Determine if the input data needs to be basecalled
220
237
  if cfg.input_type == "pod5":
221
- print(f'Detected pod5 inputs: {cfg.input_files}')
238
+ logger.info(f"Detected pod5 inputs: {cfg.input_files}")
222
239
  basecall = True
223
240
  elif cfg.input_type in ["bam"]:
224
- print(f'Detected bam input: {cfg.input_files}')
241
+ logger.info(f"Detected bam input: {cfg.input_files}")
225
242
  basecall = False
226
243
  else:
227
- print('Error, can not find input bam or pod5')
244
+ logger.info("Error, can not find input bam or pod5")
228
245
 
229
246
  # Generate the base name of the unaligned bam without the .bam suffix
230
247
  if basecall:
231
248
  model_basename = Path(cfg.model).name
232
- model_basename = str(model_basename).replace('.', '_')
233
- if cfg.smf_modality == 'direct':
249
+ model_basename = str(model_basename).replace(".", "_")
250
+ if cfg.smf_modality == "direct":
234
251
  mod_string = "_".join(cfg.mod_list)
235
252
  bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
236
253
  else:
@@ -241,7 +258,9 @@ def load_adata_core(cfg, paths: AdataPaths):
241
258
 
242
259
  # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
243
260
  unaligned_output = bam.with_suffix(cfg.bam_suffix)
244
- aligned_BAM = cfg.output_directory / (bam.stem + "_aligned") # doing this allows specifying an input bam in a seperate directory as the aligned output bams
261
+ aligned_BAM = (
262
+ cfg.output_directory / (bam.stem + "_aligned")
263
+ ) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
245
264
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
246
265
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
247
266
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
@@ -252,34 +271,40 @@ def load_adata_core(cfg, paths: AdataPaths):
252
271
  ########################################################################################################################
253
272
 
254
273
  ################################### 2) FASTA Handling ###################################
255
- from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
256
274
 
257
275
  try:
258
276
  cfg.fasta = Path(cfg.fasta)
259
- except:
260
- print("Need to provide an input FASTA path to proceed with smftools load")
277
+ except Exception:
278
+ logger.warning("Need to provide an input FASTA path to proceed with smftools load")
261
279
 
262
280
  # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
263
- if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
264
- fasta_basename = cfg.fasta.parent / cfg.fasta.stem
265
- bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
266
- output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
267
- subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
268
- fasta = cfg.output_directory / output_FASTA
281
+ if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
282
+ fasta_stem = cfg.fasta.stem
283
+ bed_stem = Path(cfg.fasta_regions_of_interest).stem
284
+ output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
285
+
286
+ logger.info("Subsampling FASTA records using the provided BED file")
287
+ subsample_fasta_from_bed(
288
+ cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
289
+ )
290
+ fasta = output_FASTA
269
291
  else:
292
+ logger.info("Using the full FASTA file")
270
293
  fasta = cfg.fasta
271
294
 
272
295
  # For conversion style SMF, make a converted reference FASTA
273
- if cfg.smf_modality == 'conversion':
274
- fasta_basename = fasta.parent / fasta.stem
275
- converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
296
+ if cfg.smf_modality == "conversion":
297
+ fasta_stem = fasta.stem
298
+ converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
276
299
  converted_FASTA = cfg.output_directory / converted_FASTA_basename
277
- if 'converted.fa' in fasta.name:
278
- print(f'{fasta} is already converted. Using existing converted FASTA.')
300
+
301
+ if "converted.fa" in fasta.name:
302
+ logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
279
303
  converted_FASTA = fasta
280
304
  elif converted_FASTA.exists():
281
- print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
305
+ logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
282
306
  else:
307
+ logger.info(f"Converting FASTA base sequences")
283
308
  generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
284
309
  fasta = converted_FASTA
285
310
 
@@ -290,121 +315,164 @@ def load_adata_core(cfg, paths: AdataPaths):
290
315
  ########################################################################################################################
291
316
 
292
317
  ################################### 3) Basecalling ###################################
293
- from ..informatics.basecalling import modcall, canoncall
318
+
294
319
  # 1) Basecall using dorado
295
- if basecall and cfg.sequencer == 'ont':
320
+ if basecall and cfg.sequencer == "ont":
296
321
  try:
297
322
  cfg.model_dir = Path(cfg.model_dir)
298
- except:
299
- print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
323
+ except Exception:
324
+ logger.warning(
325
+ "Need to provide a valid path to a dorado model directory to use dorado basecalling"
326
+ )
300
327
  if aligned_sorted_output.exists():
301
- print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
328
+ logger.info(
329
+ f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
330
+ )
302
331
  elif unaligned_output.exists():
303
- print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
304
- elif cfg.smf_modality != 'direct':
305
- canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
332
+ logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
333
+ elif cfg.smf_modality != "direct":
334
+ logger.info("Running canonical basecalling using dorado")
335
+ canoncall(
336
+ str(cfg.model_dir),
337
+ cfg.model,
338
+ str(cfg.input_data_path),
339
+ cfg.barcode_kit,
340
+ str(bam),
341
+ cfg.bam_suffix,
342
+ cfg.barcode_both_ends,
343
+ cfg.trim,
344
+ cfg.device,
345
+ )
306
346
  else:
307
- modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
347
+ logger.info("Running modified basecalling using dorado")
348
+ modcall(
349
+ str(cfg.model_dir),
350
+ cfg.model,
351
+ str(cfg.input_data_path),
352
+ cfg.barcode_kit,
353
+ cfg.mod_list,
354
+ str(bam),
355
+ cfg.bam_suffix,
356
+ cfg.barcode_both_ends,
357
+ cfg.trim,
358
+ cfg.device,
359
+ )
308
360
  elif basecall:
309
- print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
361
+ logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
310
362
  else:
311
363
  pass
312
364
  ########################################################################################################################
313
365
 
314
366
  ################################### 4) Alignment and sorting #############################################
315
- from ..informatics.bam_functions import align_and_sort_BAM
316
- from ..informatics.bed_functions import aligned_BAM_to_bed
367
+
317
368
  # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
318
369
  if aligned_sorted_output.exists():
319
- print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
370
+ logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
320
371
  else:
372
+ logger.info(f"Aligning and sorting reads")
321
373
  align_and_sort_BAM(fasta, unaligned_output, cfg)
322
374
  # Deleted the unsorted aligned output
323
375
  aligned_output.unlink()
324
376
 
325
377
  if cfg.make_beds:
326
378
  # Make beds and provide basic histograms
327
- bed_dir = cfg.output_directory / 'beds'
379
+ bed_dir = cfg.output_directory / "beds"
328
380
  if bed_dir.is_dir():
329
- print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
381
+ logger.debug(
382
+ f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
383
+ )
330
384
  else:
331
- aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
385
+ logger.info("Making bed files from the aligned and sorted BAM file")
386
+ aligned_BAM_to_bed(
387
+ aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
388
+ )
332
389
  ########################################################################################################################
333
390
 
334
391
  ################################### 5) Demultiplexing ######################################################################
335
- from ..informatics.bam_functions import demux_and_index_BAM, split_and_index_BAM
392
+
336
393
  # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
337
394
  if cfg.input_already_demuxed:
338
395
  if cfg.split_path.is_dir():
339
- print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
396
+ logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
340
397
 
341
398
  all_bam_files = sorted(
342
- p for p in cfg.split_path.iterdir()
343
- if p.is_file()
344
- and p.suffix == cfg.bam_suffix
399
+ p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
345
400
  )
346
401
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
347
402
  bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
348
403
 
349
404
  else:
350
405
  make_dirs([cfg.split_path])
351
- all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
352
- cfg.split_path,
353
- cfg.bam_suffix)
354
-
406
+ logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
407
+ all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
408
+
355
409
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
356
410
  bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
357
411
 
358
412
  se_bam_files = bam_files
359
413
  bam_dir = cfg.split_path
360
-
414
+
361
415
  else:
362
416
  if single_barcoded_path.is_dir():
363
- print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
417
+ logger.debug(
418
+ f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
419
+ )
364
420
 
365
421
  all_se_bam_files = sorted(
366
- p for p in single_barcoded_path.iterdir()
367
- if p.is_file()
368
- and p.suffix == cfg.bam_suffix
369
- )
422
+ p
423
+ for p in single_barcoded_path.iterdir()
424
+ if p.is_file() and p.suffix == cfg.bam_suffix
425
+ )
370
426
  unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
371
427
  se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
372
428
  else:
373
- make_dirs([cfg.split_path, single_barcoded_path])
374
- all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
375
- single_barcoded_path,
376
- cfg.bam_suffix,
377
- cfg.barcode_kit,
378
- False,
379
- cfg.trim,
380
- cfg.threads)
381
-
429
+ make_dirs([cfg.split_path, single_barcoded_path])
430
+ logger.info(
431
+ "Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
432
+ )
433
+ all_se_bam_files = demux_and_index_BAM(
434
+ aligned_sorted_BAM,
435
+ single_barcoded_path,
436
+ cfg.bam_suffix,
437
+ cfg.barcode_kit,
438
+ False,
439
+ cfg.trim,
440
+ cfg.threads,
441
+ )
442
+
382
443
  unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
383
444
  se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
384
-
445
+
385
446
  if double_barcoded_path.is_dir():
386
- print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
447
+ logger.debug(
448
+ f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
449
+ )
387
450
 
388
451
  all_de_bam_files = sorted(
389
- p for p in double_barcoded_path.iterdir()
390
- if p.is_file()
391
- and p.suffix == cfg.bam_suffix
392
- )
452
+ p
453
+ for p in double_barcoded_path.iterdir()
454
+ if p.is_file() and p.suffix == cfg.bam_suffix
455
+ )
393
456
  unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
394
457
  de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
395
- else:
396
- make_dirs([cfg.split_path, double_barcoded_path])
397
- all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
398
- double_barcoded_path,
399
- cfg.bam_suffix,
400
- cfg.barcode_kit,
401
- True,
402
- cfg.trim,
403
- cfg.threads)
404
-
458
+ else:
459
+ make_dirs([cfg.split_path, double_barcoded_path])
460
+ logger.info(
461
+ "Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
462
+ )
463
+ all_de_bam_files = demux_and_index_BAM(
464
+ aligned_sorted_BAM,
465
+ double_barcoded_path,
466
+ cfg.bam_suffix,
467
+ cfg.barcode_kit,
468
+ True,
469
+ cfg.trim,
470
+ cfg.threads,
471
+ )
472
+
405
473
  unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
406
474
  de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
407
-
475
+
408
476
  bam_files = se_bam_files + de_bam_files
409
477
  unclassified_bams = unclassified_se_bams + unclassified_de_bams
410
478
  bam_dir = single_barcoded_path
@@ -413,138 +481,192 @@ def load_adata_core(cfg, paths: AdataPaths):
413
481
 
414
482
  if cfg.make_beds:
415
483
  # Make beds and provide basic histograms
416
- bed_dir = cfg.split_path / 'beds'
484
+ bed_dir = cfg.split_path / "beds"
417
485
  if bed_dir.is_dir():
418
- print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
486
+ logger.debug(
487
+ f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
488
+ )
419
489
  else:
490
+ logger.info("Making BED files from BAM files for each sample")
420
491
  for bam in bam_files:
421
492
  aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
422
493
  ########################################################################################################################
423
494
 
424
495
  ################################### 6) SAMTools based BAM QC ######################################################################
425
- from ..informatics.bam_functions import bam_qc
496
+
426
497
  # 6) Samtools QC metrics on split BAM files
427
498
  bam_qc_dir = cfg.split_path / "bam_qc"
428
499
  if bam_qc_dir.is_dir():
429
- print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
500
+ logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
430
501
  else:
431
502
  make_dirs([bam_qc_dir])
503
+ logger.info("Performing BAM QC")
432
504
  bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
433
- ########################################################################################################################
505
+ ########################################################################################################################
434
506
 
435
507
  ################################### 7) AnnData loading ######################################################################
436
- if cfg.smf_modality != 'direct':
508
+ if cfg.smf_modality != "direct":
437
509
  from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
510
+
438
511
  # 7) Take the converted BAM and load it into an adata object.
439
- if cfg.smf_modality == 'deaminase':
512
+ if cfg.smf_modality == "deaminase":
440
513
  deaminase_footprinting = True
441
514
  else:
442
515
  deaminase_footprinting = False
443
- raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
444
- bam_dir,
445
- cfg.output_directory,
446
- cfg.input_already_demuxed,
447
- cfg.mapping_threshold,
448
- cfg.experiment_name,
449
- cfg.conversion_types,
450
- cfg.bam_suffix,
451
- cfg.device,
452
- cfg.threads,
453
- deaminase_footprinting,
454
- delete_intermediates=cfg.delete_intermediate_hdfs,
455
- double_barcoded_path=double_barcoded_path)
516
+
517
+ logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
518
+ raw_adata, raw_adata_path = converted_BAM_to_adata(
519
+ fasta,
520
+ bam_dir,
521
+ cfg.output_directory,
522
+ cfg.input_already_demuxed,
523
+ cfg.mapping_threshold,
524
+ cfg.experiment_name,
525
+ cfg.conversion_types,
526
+ cfg.bam_suffix,
527
+ cfg.device,
528
+ cfg.threads,
529
+ deaminase_footprinting,
530
+ delete_intermediates=cfg.delete_intermediate_hdfs,
531
+ double_barcoded_path=double_barcoded_path,
532
+ )
456
533
  else:
457
534
  if mod_bed_dir.is_dir():
458
- print(f'{mod_bed_dir} already exists, skipping making modbeds')
535
+ logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
459
536
  else:
460
- from ..informatics.modkit_functions import modQC, make_modbed
461
- make_dirs([mod_bed_dir])
462
-
463
- modQC(aligned_sorted_output,
464
- cfg.thresholds) # get QC metrics for mod calls
465
-
466
- make_modbed(aligned_sorted_output,
467
- cfg.thresholds,
468
- mod_bed_dir) # Generate bed files of position methylation summaries for every sample
469
-
537
+ from ..informatics.modkit_functions import make_modbed, modQC
538
+
539
+ make_dirs([mod_bed_dir])
540
+
541
+ logger.info("Performing modQC for direct footprinting samples")
542
+
543
+ modQC(aligned_sorted_output, cfg.thresholds) # get QC metrics for mod calls
544
+
545
+ logger.info("Making modified BED files for direct footprinting samples")
546
+
547
+ make_modbed(
548
+ aligned_sorted_output, cfg.thresholds, mod_bed_dir
549
+ ) # Generate bed files of position methylation summaries for every sample
550
+
470
551
  from ..informatics.modkit_functions import extract_mods
552
+
471
553
  make_dirs([mod_tsv_dir])
472
554
 
473
- extract_mods(cfg.thresholds,
474
- mod_tsv_dir,
475
- bam_dir,
476
- cfg.bam_suffix,
477
- skip_unclassified=cfg.skip_unclassified,
478
- modkit_summary=False,
479
- threads=cfg.threads) # Extract methylations calls for split BAM files into split TSV files
480
-
555
+ logger.info(
556
+ "Extracting single read modification states into TSVs for direct footprinting samples"
557
+ )
558
+
559
+ extract_mods(
560
+ cfg.thresholds,
561
+ mod_tsv_dir,
562
+ bam_dir,
563
+ cfg.bam_suffix,
564
+ skip_unclassified=cfg.skip_unclassified,
565
+ modkit_summary=False,
566
+ threads=cfg.threads,
567
+ ) # Extract methylations calls for split BAM files into split TSV files
568
+
481
569
  from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
482
- #6 Load the modification data from TSVs into an adata object
483
- raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
484
- bam_dir,
485
- cfg.output_directory,
486
- cfg.input_already_demuxed,
487
- cfg.mapping_threshold,
488
- cfg.experiment_name,
489
- mods,
490
- cfg.batch_size,
491
- mod_tsv_dir,
492
- cfg.delete_batch_hdfs,
493
- cfg.threads,
494
- double_barcoded_path)
570
+
571
+ logger.info("Making Anndata for direct modification detection SMF samples")
572
+
573
+ # 7) Load the modification data from TSVs into an adata object
574
+ raw_adata, raw_adata_path = modkit_extract_to_adata(
575
+ fasta,
576
+ bam_dir,
577
+ cfg.output_directory,
578
+ cfg.input_already_demuxed,
579
+ cfg.mapping_threshold,
580
+ cfg.experiment_name,
581
+ mods,
582
+ cfg.batch_size,
583
+ mod_tsv_dir,
584
+ cfg.delete_batch_hdfs,
585
+ cfg.threads,
586
+ double_barcoded_path,
587
+ )
495
588
  if cfg.delete_intermediate_tsvs:
496
589
  delete_tsvs(mod_tsv_dir)
497
590
 
498
- raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
499
- raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
591
+ raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
592
+ raw_adata.obs["Experiment_name_and_barcode"] = (
593
+ raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
594
+ )
500
595
 
501
596
  ########################################################################################################################
502
597
 
503
598
  ############################################### Add basic read length, read quality, mapping quality stats ###############################################
504
- from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
505
- from ..informatics.bam_functions import extract_read_features_from_bam
506
- add_read_length_and_mapping_qc(raw_adata, se_bam_files,
507
- extract_read_features_from_bam_callable=extract_read_features_from_bam,
508
- bypass=cfg.bypass_add_read_length_and_mapping_qc,
509
- force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
510
599
 
511
- raw_adata.obs['Raw_modification_signal'] = np.nansum(raw_adata.X, axis=1)
600
+ logger.info("Adding read length, mapping quality, and modification signal to Anndata")
601
+ add_read_length_and_mapping_qc(
602
+ raw_adata,
603
+ se_bam_files,
604
+ extract_read_features_from_bam_callable=extract_read_features_from_bam,
605
+ bypass=cfg.bypass_add_read_length_and_mapping_qc,
606
+ force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
607
+ )
608
+
609
+ raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
610
+ ########################################################################################################################
611
+
612
+ ############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
613
+ from ..informatics.h5ad_functions import annotate_pod5_origin
614
+
615
+ if cfg.input_type == "pod5":
616
+ logger.info("Adding the POD5 origin file to each read into Anndata")
617
+ annotate_pod5_origin(
618
+ raw_adata,
619
+ cfg.input_data_path,
620
+ n_jobs=cfg.threads,
621
+ csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
622
+ )
512
623
  ########################################################################################################################
513
624
 
514
625
  ############################################### Save final adata ###############################################
515
- print(f"Saving AnnData to {raw_adata_path}")
626
+ logger.info(f"Saving AnnData to {raw_adata_path}")
627
+ record_smftools_metadata(
628
+ raw_adata,
629
+ step_name="load",
630
+ cfg=cfg,
631
+ config_path=config_path,
632
+ output_path=raw_adata_path,
633
+ )
516
634
  write_gz_h5ad(raw_adata, raw_adata_path)
517
635
  ########################################################################################################################
518
636
 
519
637
  ############################################### MultiQC HTML Report ###############################################
520
- from ..informatics.run_multiqc import run_multiqc
638
+
521
639
  # multiqc ###
522
640
  mqc_dir = cfg.split_path / "multiqc"
523
641
  if mqc_dir.is_dir():
524
- print(f'{mqc_dir} already exists, skipping multiqc')
642
+ logger.debug(f"{mqc_dir} already exists, skipping multiqc")
525
643
  else:
644
+ logger.info("Running multiqc")
526
645
  run_multiqc(cfg.split_path, mqc_dir)
527
646
  ########################################################################################################################
528
647
 
529
648
  ############################################### delete intermediate BAM files ###############################################
530
649
  if cfg.delete_intermediate_bams:
650
+ logger.info("Deleting intermediate BAM files")
531
651
  # delete aligned and sorted bam
532
652
  aligned_sorted_output.unlink()
533
- bai = aligned_sorted_output.parent / (aligned_sorted_output.name + '.bai')
653
+ bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
534
654
  bai.unlink()
535
655
  # delete the demultiplexed bams. Keep the demultiplexing summary files and directories to facilitate demultiplexing in the future with these files
536
656
  for bam in bam_files:
537
- bai = bam.parent / (bam.name + '.bai')
657
+ bai = bam.parent / (bam.name + ".bai")
538
658
  bam.unlink()
539
659
  bai.unlink()
540
660
  for bam in unclassified_bams:
541
- bai = bam.parent / (bam.name + '.bai')
661
+ bai = bam.parent / (bam.name + ".bai")
542
662
  bam.unlink()
543
- bai.unlink()
663
+ bai.unlink()
664
+ logger.info("Finished deleting intermediate BAM files")
544
665
  ########################################################################################################################
545
666
 
546
667
  return raw_adata, raw_adata_path, cfg
547
668
 
669
+
548
670
  def load_adata(config_path: str):
549
671
  """
550
672
  CLI-facing wrapper for the load pipeline.
@@ -565,15 +687,11 @@ def load_adata(config_path: str):
565
687
  cfg : ExperimentConfig
566
688
  Config object for downstream steps.
567
689
  """
568
- from importlib import resources
569
690
  from datetime import datetime
570
- from pathlib import Path
571
-
572
- import pandas as pd # used for summary file reading downstream if needed
573
-
574
- from ..readwrite import make_dirs, add_or_update_column_in_csv
575
- from ..config import LoadExperimentConfig, ExperimentConfig
691
+ from importlib import resources
576
692
 
693
+ from ..config import ExperimentConfig, LoadExperimentConfig
694
+ from ..readwrite import add_or_update_column_in_csv, make_dirs
577
695
  from .helpers import get_adata_paths
578
696
 
579
697
  date_str = datetime.today().strftime("%y%m%d")
@@ -613,25 +731,27 @@ def load_adata(config_path: str):
613
731
  # -----------------------------
614
732
  if not getattr(cfg, "force_redo_load_adata", False):
615
733
  if paths.hmm.exists():
616
- print(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
734
+ logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
617
735
  return None, paths.hmm, cfg
618
736
  if paths.spatial.exists():
619
- print(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
737
+ logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
620
738
  return None, paths.spatial, cfg
621
739
  if paths.pp_dedup.exists():
622
- print(
740
+ logger.debug(
623
741
  f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
624
742
  f"Skipping smftools load"
625
743
  )
626
744
  return None, paths.pp_dedup, cfg
627
745
  if paths.pp.exists():
628
- print(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
746
+ logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
629
747
  return None, paths.pp, cfg
630
748
  if paths.raw.exists():
631
- print(f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load")
749
+ logger.debug(
750
+ f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
751
+ )
632
752
  return None, paths.raw, cfg
633
753
 
634
754
  # If we get here, we actually want to run the full load pipeline
635
- adata, adata_path, cfg = load_adata_core(cfg, paths)
755
+ adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
636
756
 
637
- return adata, adata_path, cfg
757
+ return adata, adata_path, cfg