smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +48 -0
  3. smftools/cli/hmm_adata.py +168 -145
  4. smftools/cli/load_adata.py +155 -95
  5. smftools/cli/preprocess_adata.py +222 -130
  6. smftools/cli/spatial_adata.py +441 -308
  7. smftools/cli_entry.py +4 -5
  8. smftools/config/conversion.yaml +12 -5
  9. smftools/config/deaminase.yaml +11 -9
  10. smftools/config/default.yaml +123 -19
  11. smftools/config/direct.yaml +3 -0
  12. smftools/config/experiment_config.py +120 -19
  13. smftools/hmm/HMM.py +12 -1
  14. smftools/hmm/__init__.py +0 -6
  15. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  16. smftools/hmm/call_hmm_peaks.py +318 -90
  17. smftools/informatics/bam_functions.py +28 -29
  18. smftools/informatics/h5ad_functions.py +1 -1
  19. smftools/plotting/general_plotting.py +97 -51
  20. smftools/plotting/position_stats.py +3 -3
  21. smftools/preprocessing/__init__.py +2 -4
  22. smftools/preprocessing/append_base_context.py +34 -25
  23. smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
  24. smftools/preprocessing/binarize_on_Youden.py +10 -8
  25. smftools/preprocessing/calculate_complexity_II.py +1 -1
  26. smftools/preprocessing/calculate_coverage.py +16 -13
  27. smftools/preprocessing/calculate_position_Youden.py +41 -25
  28. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  29. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  30. smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
  31. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  32. smftools/preprocessing/invert_adata.py +1 -1
  33. smftools/preprocessing/load_sample_sheet.py +1 -1
  34. smftools/preprocessing/reindex_references_adata.py +37 -0
  35. smftools/readwrite.py +94 -0
  36. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
  37. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
  38. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  39. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  40. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  41. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  42. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  43. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  44. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  45. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
  46. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -2,6 +2,8 @@ import shutil
2
2
  from pathlib import Path
3
3
  from typing import Union, Iterable
4
4
 
5
+ from .helpers import AdataPaths
6
+
5
7
  def check_executable_exists(cmd: str) -> bool:
6
8
  """Return True if a command-line executable is available in PATH."""
7
9
  return shutil.which(cmd) is not None
@@ -66,117 +68,81 @@ def delete_tsvs(
66
68
  except Exception as e:
67
69
  print(f"[error] failed to remove tmp dir {td}: {e}")
68
70
 
69
- def load_adata(config_path):
71
+ def load_adata_core(cfg, paths: AdataPaths):
70
72
  """
71
- High-level function to call for converting raw sequencing data to an adata object.
72
- Command line accesses this through smftools load <config_path>
73
- Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
74
- Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
75
- Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
76
-
77
- Parameters:
78
- config_path (str): A string representing the file path to the experiment configuration csv file.
79
-
80
- Returns:
81
- adata, adata_path, se_bam_files, cfg
73
+ Core load pipeline.
74
+
75
+ Assumes:
76
+ - cfg is a fully initialized ExperimentConfig
77
+ - paths is an AdataPaths object describing canonical h5ad stage paths
78
+ - No stage-skipping or early returns based on existing AnnDatas are done here
79
+ (that happens in the wrapper).
80
+
81
+ Does:
82
+ - handle input format (fast5/pod5/fastq/bam/h5ad)
83
+ - basecalling / alignment / demux / BAM QC
84
+ - optional bed + bigwig generation
85
+ - AnnData construction (conversion or direct modality)
86
+ - basic read-level QC annotations
87
+ - write raw AnnData to paths.raw
88
+ - run MultiQC
89
+ - optional deletion of intermediate BAMs
90
+
91
+ Returns
92
+ -------
93
+ raw_adata : anndata.AnnData
94
+ Newly created raw AnnData object.
95
+ raw_adata_path : Path
96
+ Path where the raw AnnData was written (paths.raw).
97
+ cfg : ExperimentConfig
98
+ (Same object, possibly with some fields updated, e.g. fasta path.)
82
99
  """
83
- from ..readwrite import make_dirs, safe_write_h5ad, add_or_update_column_in_csv
84
- from ..config import LoadExperimentConfig, ExperimentConfig
85
- from ..informatics.bam_functions import concatenate_fastqs_to_bam
86
- from ..informatics.pod5_functions import fast5_to_pod5
87
- from ..informatics.fasta_functions import subsample_fasta_from_bed
100
+ import os
101
+ from pathlib import Path
88
102
 
89
103
  import numpy as np
90
104
  import pandas as pd
91
105
  import anndata as ad
92
106
  import scanpy as sc
93
107
 
94
- import os
95
- from importlib import resources
96
- from pathlib import Path
97
-
98
- from datetime import datetime
99
- date_str = datetime.today().strftime("%y%m%d")
100
-
101
- ################################### 1) General params and input organization ###################################
102
-
103
- # Load experiment config parameters into global variables
104
- loader = LoadExperimentConfig(config_path)
105
- defaults_dir = resources.files("smftools").joinpath("config")
106
- cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
107
-
108
- # Make initial output directory
109
- make_dirs([cfg.output_directory])
110
-
111
- # Make a csv that contains experiment summary file paths
112
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
113
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
114
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
115
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
116
-
117
- # Initial h5ad file naming
118
- h5_dir = cfg.output_directory / 'h5ads'
119
- raw_adata_path = h5_dir / f'{cfg.experiment_name}.h5ad.gz'
108
+ from .helpers import write_gz_h5ad
120
109
 
121
- # Preprocessed adata path info
122
- pp_adata_basename = raw_adata_path.name.split(".")[0] + '_preprocessed.h5ad.gz'
123
- pp_adata_path = raw_adata_path.parent / pp_adata_basename
110
+ from ..readwrite import make_dirs, add_or_update_column_in_csv
124
111
 
125
- # Preprocessed duplicate removed adata path info
126
- if cfg.smf_modality == 'direct':
127
- # For direct SMF, link the duplicate removed version just to the preprocessed version, since there is not a duplicate removal step for direct workflow
128
- pp_dup_rem_adata_path = pp_adata_path
129
- else:
130
- pp_dup_rem_adata_basename = pp_adata_path.name.split(".")[0] + '_duplicates_removed.h5ad.gz'
131
- pp_dup_rem_adata_path = pp_adata_path.parent / pp_dup_rem_adata_basename
132
-
133
- # Preprocessed duplicate removed adata with basic analyses appended path info
134
- spatial_adata_basename = pp_dup_rem_adata_path.name.split(".")[0] + '_spatial.h5ad.gz'
135
- spatial_adata_path = pp_dup_rem_adata_path.parent / spatial_adata_basename
136
-
137
- # hmm adata
138
- hmm_adata_basename = spatial_adata_path.name.split(".")[0] + '_hmm.h5ad.gz'
139
- hmm_adata_path = spatial_adata_path.parent / hmm_adata_basename
112
+ from ..informatics.bam_functions import concatenate_fastqs_to_bam, align_and_sort_BAM, demux_and_index_BAM, split_and_index_BAM, bam_qc, extract_read_features_from_bam
113
+ from ..informatics.bed_functions import aligned_BAM_to_bed
114
+ from ..informatics.pod5_functions import fast5_to_pod5
115
+ from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
116
+ from ..informatics.basecalling import modcall, canoncall
117
+ from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
118
+ from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
119
+ from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
120
+ from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
121
+ from ..informatics.run_multiqc import run_multiqc
140
122
 
141
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", raw_adata_path)
142
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", pp_adata_path)
143
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", pp_dup_rem_adata_path)
144
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
145
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
123
+ ################################### 1) General params and input organization ###################################
124
+ output_directory = Path(cfg.output_directory)
125
+ make_dirs([output_directory])
146
126
 
147
- if cfg.force_redo_load_adata:
148
- pass
149
- elif hmm_adata_path.exists():
150
- print(f"HMM AnnData already exists: {hmm_adata_path}\n Skipping smftools load")
151
- return None, hmm_adata_path, cfg
152
- elif spatial_adata_path.exists():
153
- print(f"Spatial AnnData already exists: {spatial_adata_path}\n Skipping smftools load")
154
- return None, spatial_adata_path, cfg
155
- elif pp_dup_rem_adata_path.exists():
156
- print(f"Preprocessed deduplicated AnnData already exists: {pp_dup_rem_adata_path}\n Skipping smftools load")
157
- return None, pp_dup_rem_adata_path, cfg
158
- elif pp_adata_path.exists():
159
- print(f"Preprocessed Anndata already exists: {pp_adata_path}\n Skipping smftools load")
160
- return None, pp_adata_path, cfg
161
- elif raw_adata_path.exists():
162
- print(f"Anndata from smftools load already exists: {raw_adata_path}\n Skipping smftools load")
163
- return None, raw_adata_path, cfg
164
- else:
165
- pass
127
+ raw_adata_path = paths.raw
128
+ pp_adata_path = paths.pp
129
+ pp_dup_rem_adata_path = paths.pp_dedup
130
+ spatial_adata_path = paths.spatial
131
+ hmm_adata_path = paths.hmm
166
132
 
167
133
  # Naming of the demultiplexed output directory
168
134
  double_barcoded_path = cfg.split_path / "both_ends_barcoded"
169
135
  single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"
170
136
 
171
137
  # Direct methylation detection SMF specific parameters
172
- if cfg.smf_modality == 'direct':
138
+ if cfg.smf_modality == "direct":
173
139
  mod_bed_dir = cfg.output_directory / "mod_beds"
174
140
  add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
175
141
  mod_tsv_dir = cfg.output_directory / "mod_tsvs"
176
142
  add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
177
143
  bam_qc_dir = cfg.output_directory / "bam_qc"
178
- mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
179
- mods = [mod_map[mod] for mod in cfg.mod_list]
144
+ mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
145
+
180
146
  if not check_executable_exists("dorado"):
181
147
  raise RuntimeError(
182
148
  "Error: 'dorado' is not installed or not in PATH. "
@@ -188,9 +154,12 @@ def load_adata(config_path):
188
154
  "Install from https://github.com/nanoporetech/modkit"
189
155
  )
190
156
  else:
191
- pass
192
-
193
- if not cfg.input_already_demuxed or cfg.aligner == "dorado":
157
+ mod_bed_dir = None
158
+ mod_tsv_dir = None
159
+ mods = None
160
+
161
+ # demux / aligner executables
162
+ if (not cfg.input_already_demuxed) or cfg.aligner == "dorado":
194
163
  if not check_executable_exists("dorado"):
195
164
  raise RuntimeError(
196
165
  "Error: 'dorado' is not installed or not in PATH. "
@@ -216,7 +185,7 @@ def load_adata(config_path):
216
185
  fast5_to_pod5(cfg.input_data_path, output_pod5)
217
186
  # Reassign the pod5_dir variable to point to the new pod5 file.
218
187
  cfg.input_data_path = output_pod5
219
- cfg.input_type == "pod5"
188
+ cfg.input_type = "pod5"
220
189
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
221
190
  elif cfg.input_type == "fastq":
222
191
  # Output file for FASTQ concatenation.
@@ -349,7 +318,7 @@ def load_adata(config_path):
349
318
  if aligned_sorted_output.exists():
350
319
  print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
351
320
  else:
352
- align_and_sort_BAM(fasta, unaligned_output, cfg.bam_suffix, cfg.output_directory, cfg.make_bigwigs, cfg.threads, cfg.aligner, cfg.aligner_args)
321
+ align_and_sort_BAM(fasta, unaligned_output, cfg)
353
322
  # Deleted the unsorted aligned output
354
323
  aligned_output.unlink()
355
324
 
@@ -544,7 +513,7 @@ def load_adata(config_path):
544
513
 
545
514
  ############################################### Save final adata ###############################################
546
515
  print(f"Saving AnnData to {raw_adata_path}")
547
- safe_write_h5ad(raw_adata, raw_adata_path, compression='gzip', backup=True)
516
+ write_gz_h5ad(raw_adata, raw_adata_path)
548
517
  ########################################################################################################################
549
518
 
550
519
  ############################################### MultiQC HTML Report ###############################################
@@ -574,4 +543,95 @@ def load_adata(config_path):
574
543
  bai.unlink()
575
544
  ########################################################################################################################
576
545
 
577
- return raw_adata, raw_adata_path, cfg
546
+ return raw_adata, raw_adata_path, cfg
547
+
548
+ def load_adata(config_path: str):
549
+ """
550
+ CLI-facing wrapper for the load pipeline.
551
+
552
+ - Reads config CSV into ExperimentConfig
553
+ - Computes canonical paths for all downstream AnnData stages
554
+ - Registers those in the summary CSV
555
+ - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
556
+ - If needed, calls the core pipeline to actually build the raw AnnData
557
+
558
+ Returns
559
+ -------
560
+ adata : anndata.AnnData | None
561
+ Newly created AnnData object, or None if we skipped because a later-stage
562
+ AnnData already exists.
563
+ adata_path : pathlib.Path
564
+ Path to the "current" AnnData that should be used downstream.
565
+ cfg : ExperimentConfig
566
+ Config object for downstream steps.
567
+ """
568
+ from importlib import resources
569
+ from datetime import datetime
570
+ from pathlib import Path
571
+
572
+ import pandas as pd # used for summary file reading downstream if needed
573
+
574
+ from ..readwrite import make_dirs, add_or_update_column_in_csv
575
+ from ..config import LoadExperimentConfig, ExperimentConfig
576
+
577
+ from .helpers import get_adata_paths
578
+
579
+ date_str = datetime.today().strftime("%y%m%d")
580
+
581
+ # -----------------------------
582
+ # 1) Load config into cfg
583
+ # -----------------------------
584
+ loader = LoadExperimentConfig(config_path)
585
+ defaults_dir = resources.files("smftools").joinpath("config")
586
+ cfg, report = ExperimentConfig.from_var_dict(
587
+ loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
588
+ )
589
+
590
+ # Ensure base output dir
591
+ make_dirs([cfg.output_directory])
592
+
593
+ # -----------------------------
594
+ # 2) Compute and register paths
595
+ # -----------------------------
596
+ paths = get_adata_paths(cfg)
597
+
598
+ # experiment-level metadata in summary CSV
599
+ add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
600
+ add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
601
+ add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
602
+ add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
603
+
604
+ # AnnData stage paths
605
+ add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
606
+ add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
607
+ add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
608
+ add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
609
+ add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
610
+
611
+ # -----------------------------
612
+ # 3) Stage skipping logic
613
+ # -----------------------------
614
+ if not getattr(cfg, "force_redo_load_adata", False):
615
+ if paths.hmm.exists():
616
+ print(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
617
+ return None, paths.hmm, cfg
618
+ if paths.spatial.exists():
619
+ print(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
620
+ return None, paths.spatial, cfg
621
+ if paths.pp_dedup.exists():
622
+ print(
623
+ f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
624
+ f"Skipping smftools load"
625
+ )
626
+ return None, paths.pp_dedup, cfg
627
+ if paths.pp.exists():
628
+ print(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
629
+ return None, paths.pp, cfg
630
+ if paths.raw.exists():
631
+ print(f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load")
632
+ return None, paths.raw, cfg
633
+
634
+ # If we get here, we actually want to run the full load pipeline
635
+ adata, adata_path, cfg = load_adata_core(cfg, paths)
636
+
637
+ return adata, adata_path, cfg