smftools-0.2.3-py3-none-any.whl → smftools-0.2.4-py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +168 -145
- smftools/cli/load_adata.py +155 -95
- smftools/cli/preprocess_adata.py +222 -130
- smftools/cli/spatial_adata.py +441 -308
- smftools/cli_entry.py +4 -5
- smftools/config/conversion.yaml +12 -5
- smftools/config/deaminase.yaml +11 -9
- smftools/config/default.yaml +123 -19
- smftools/config/direct.yaml +3 -0
- smftools/config/experiment_config.py +120 -19
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/bam_functions.py +28 -29
- smftools/informatics/h5ad_functions.py +1 -1
- smftools/plotting/general_plotting.py +97 -51
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +2 -4
- smftools/preprocessing/append_base_context.py +34 -25
- smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
- smftools/preprocessing/binarize_on_Youden.py +10 -8
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +41 -25
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +94 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/cli/load_adata.py
CHANGED
|
@@ -2,6 +2,8 @@ import shutil
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from typing import Union, Iterable
|
|
4
4
|
|
|
5
|
+
from .helpers import AdataPaths
|
|
6
|
+
|
|
5
7
|
def check_executable_exists(cmd: str) -> bool:
|
|
6
8
|
"""Return True if a command-line executable is available in PATH."""
|
|
7
9
|
return shutil.which(cmd) is not None
|
|
@@ -66,117 +68,81 @@ def delete_tsvs(
|
|
|
66
68
|
except Exception as e:
|
|
67
69
|
print(f"[error] failed to remove tmp dir {td}: {e}")
|
|
68
70
|
|
|
69
|
-
def
|
|
71
|
+
def load_adata_core(cfg, paths: AdataPaths):
|
|
70
72
|
"""
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
73
|
+
Core load pipeline.
|
|
74
|
+
|
|
75
|
+
Assumes:
|
|
76
|
+
- cfg is a fully initialized ExperimentConfig
|
|
77
|
+
- paths is an AdataPaths object describing canonical h5ad stage paths
|
|
78
|
+
- No stage-skipping or early returns based on existing AnnDatas are done here
|
|
79
|
+
(that happens in the wrapper).
|
|
80
|
+
|
|
81
|
+
Does:
|
|
82
|
+
- handle input format (fast5/pod5/fastq/bam/h5ad)
|
|
83
|
+
- basecalling / alignment / demux / BAM QC
|
|
84
|
+
- optional bed + bigwig generation
|
|
85
|
+
- AnnData construction (conversion or direct modality)
|
|
86
|
+
- basic read-level QC annotations
|
|
87
|
+
- write raw AnnData to paths.raw
|
|
88
|
+
- run MultiQC
|
|
89
|
+
- optional deletion of intermediate BAMs
|
|
90
|
+
|
|
91
|
+
Returns
|
|
92
|
+
-------
|
|
93
|
+
raw_adata : anndata.AnnData
|
|
94
|
+
Newly created raw AnnData object.
|
|
95
|
+
raw_adata_path : Path
|
|
96
|
+
Path where the raw AnnData was written (paths.raw).
|
|
97
|
+
cfg : ExperimentConfig
|
|
98
|
+
(Same object, possibly with some fields updated, e.g. fasta path.)
|
|
82
99
|
"""
|
|
83
|
-
|
|
84
|
-
from
|
|
85
|
-
from ..informatics.bam_functions import concatenate_fastqs_to_bam
|
|
86
|
-
from ..informatics.pod5_functions import fast5_to_pod5
|
|
87
|
-
from ..informatics.fasta_functions import subsample_fasta_from_bed
|
|
100
|
+
import os
|
|
101
|
+
from pathlib import Path
|
|
88
102
|
|
|
89
103
|
import numpy as np
|
|
90
104
|
import pandas as pd
|
|
91
105
|
import anndata as ad
|
|
92
106
|
import scanpy as sc
|
|
93
107
|
|
|
94
|
-
import
|
|
95
|
-
from importlib import resources
|
|
96
|
-
from pathlib import Path
|
|
97
|
-
|
|
98
|
-
from datetime import datetime
|
|
99
|
-
date_str = datetime.today().strftime("%y%m%d")
|
|
100
|
-
|
|
101
|
-
################################### 1) General params and input organization ###################################
|
|
102
|
-
|
|
103
|
-
# Load experiment config parameters into global variables
|
|
104
|
-
loader = LoadExperimentConfig(config_path)
|
|
105
|
-
defaults_dir = resources.files("smftools").joinpath("config")
|
|
106
|
-
cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
|
|
107
|
-
|
|
108
|
-
# Make initial output directory
|
|
109
|
-
make_dirs([cfg.output_directory])
|
|
110
|
-
|
|
111
|
-
# Make a csv that contains experiment summary file paths
|
|
112
|
-
add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
|
|
113
|
-
add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
|
|
114
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
115
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
|
|
116
|
-
|
|
117
|
-
# Initial h5ad file naming
|
|
118
|
-
h5_dir = cfg.output_directory / 'h5ads'
|
|
119
|
-
raw_adata_path = h5_dir / f'{cfg.experiment_name}.h5ad.gz'
|
|
108
|
+
from .helpers import write_gz_h5ad
|
|
120
109
|
|
|
121
|
-
|
|
122
|
-
pp_adata_basename = raw_adata_path.name.split(".")[0] + '_preprocessed.h5ad.gz'
|
|
123
|
-
pp_adata_path = raw_adata_path.parent / pp_adata_basename
|
|
110
|
+
from ..readwrite import make_dirs, add_or_update_column_in_csv
|
|
124
111
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
spatial_adata_path = pp_dup_rem_adata_path.parent / spatial_adata_basename
|
|
136
|
-
|
|
137
|
-
# hmm adata
|
|
138
|
-
hmm_adata_basename = spatial_adata_path.name.split(".")[0] + '_hmm.h5ad.gz'
|
|
139
|
-
hmm_adata_path = spatial_adata_path.parent / hmm_adata_basename
|
|
112
|
+
from ..informatics.bam_functions import concatenate_fastqs_to_bam, align_and_sort_BAM, demux_and_index_BAM, split_and_index_BAM, bam_qc, extract_read_features_from_bam
|
|
113
|
+
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
114
|
+
from ..informatics.pod5_functions import fast5_to_pod5
|
|
115
|
+
from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
|
|
116
|
+
from ..informatics.basecalling import modcall, canoncall
|
|
117
|
+
from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
|
|
118
|
+
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
119
|
+
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
120
|
+
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
121
|
+
from ..informatics.run_multiqc import run_multiqc
|
|
140
122
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
|
|
145
|
-
add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
|
|
123
|
+
################################### 1) General params and input organization ###################################
|
|
124
|
+
output_directory = Path(cfg.output_directory)
|
|
125
|
+
make_dirs([output_directory])
|
|
146
126
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
elif spatial_adata_path.exists():
|
|
153
|
-
print(f"Spatial AnnData already exists: {spatial_adata_path}\n Skipping smftools load")
|
|
154
|
-
return None, spatial_adata_path, cfg
|
|
155
|
-
elif pp_dup_rem_adata_path.exists():
|
|
156
|
-
print(f"Preprocessed deduplicated AnnData already exists: {pp_dup_rem_adata_path}\n Skipping smftools load")
|
|
157
|
-
return None, pp_dup_rem_adata_path, cfg
|
|
158
|
-
elif pp_adata_path.exists():
|
|
159
|
-
print(f"Preprocessed Anndata already exists: {pp_adata_path}\n Skipping smftools load")
|
|
160
|
-
return None, pp_adata_path, cfg
|
|
161
|
-
elif raw_adata_path.exists():
|
|
162
|
-
print(f"Anndata from smftools load already exists: {raw_adata_path}\n Skipping smftools load")
|
|
163
|
-
return None, raw_adata_path, cfg
|
|
164
|
-
else:
|
|
165
|
-
pass
|
|
127
|
+
raw_adata_path = paths.raw
|
|
128
|
+
pp_adata_path = paths.pp
|
|
129
|
+
pp_dup_rem_adata_path = paths.pp_dedup
|
|
130
|
+
spatial_adata_path = paths.spatial
|
|
131
|
+
hmm_adata_path = paths.hmm
|
|
166
132
|
|
|
167
133
|
# Naming of the demultiplexed output directory
|
|
168
134
|
double_barcoded_path = cfg.split_path / "both_ends_barcoded"
|
|
169
135
|
single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"
|
|
170
136
|
|
|
171
137
|
# Direct methylation detection SMF specific parameters
|
|
172
|
-
if cfg.smf_modality ==
|
|
138
|
+
if cfg.smf_modality == "direct":
|
|
173
139
|
mod_bed_dir = cfg.output_directory / "mod_beds"
|
|
174
140
|
add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
|
|
175
141
|
mod_tsv_dir = cfg.output_directory / "mod_tsvs"
|
|
176
142
|
add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
|
|
177
143
|
bam_qc_dir = cfg.output_directory / "bam_qc"
|
|
178
|
-
|
|
179
|
-
|
|
144
|
+
mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
|
|
145
|
+
|
|
180
146
|
if not check_executable_exists("dorado"):
|
|
181
147
|
raise RuntimeError(
|
|
182
148
|
"Error: 'dorado' is not installed or not in PATH. "
|
|
@@ -188,9 +154,12 @@ def load_adata(config_path):
|
|
|
188
154
|
"Install from https://github.com/nanoporetech/modkit"
|
|
189
155
|
)
|
|
190
156
|
else:
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
157
|
+
mod_bed_dir = None
|
|
158
|
+
mod_tsv_dir = None
|
|
159
|
+
mods = None
|
|
160
|
+
|
|
161
|
+
# demux / aligner executables
|
|
162
|
+
if (not cfg.input_already_demuxed) or cfg.aligner == "dorado":
|
|
194
163
|
if not check_executable_exists("dorado"):
|
|
195
164
|
raise RuntimeError(
|
|
196
165
|
"Error: 'dorado' is not installed or not in PATH. "
|
|
@@ -216,7 +185,7 @@ def load_adata(config_path):
|
|
|
216
185
|
fast5_to_pod5(cfg.input_data_path, output_pod5)
|
|
217
186
|
# Reassign the pod5_dir variable to point to the new pod5 file.
|
|
218
187
|
cfg.input_data_path = output_pod5
|
|
219
|
-
cfg.input_type
|
|
188
|
+
cfg.input_type = "pod5"
|
|
220
189
|
# If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
|
|
221
190
|
elif cfg.input_type == "fastq":
|
|
222
191
|
# Output file for FASTQ concatenation.
|
|
@@ -349,7 +318,7 @@ def load_adata(config_path):
|
|
|
349
318
|
if aligned_sorted_output.exists():
|
|
350
319
|
print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
|
|
351
320
|
else:
|
|
352
|
-
align_and_sort_BAM(fasta, unaligned_output, cfg
|
|
321
|
+
align_and_sort_BAM(fasta, unaligned_output, cfg)
|
|
353
322
|
# Deleted the unsorted aligned output
|
|
354
323
|
aligned_output.unlink()
|
|
355
324
|
|
|
@@ -544,7 +513,7 @@ def load_adata(config_path):
|
|
|
544
513
|
|
|
545
514
|
############################################### Save final adata ###############################################
|
|
546
515
|
print(f"Saving AnnData to {raw_adata_path}")
|
|
547
|
-
|
|
516
|
+
write_gz_h5ad(raw_adata, raw_adata_path)
|
|
548
517
|
########################################################################################################################
|
|
549
518
|
|
|
550
519
|
############################################### MultiQC HTML Report ###############################################
|
|
@@ -574,4 +543,95 @@ def load_adata(config_path):
|
|
|
574
543
|
bai.unlink()
|
|
575
544
|
########################################################################################################################
|
|
576
545
|
|
|
577
|
-
return raw_adata, raw_adata_path, cfg
|
|
546
|
+
return raw_adata, raw_adata_path, cfg
|
|
547
|
+
|
|
548
|
+
def load_adata(config_path: str):
|
|
549
|
+
"""
|
|
550
|
+
CLI-facing wrapper for the load pipeline.
|
|
551
|
+
|
|
552
|
+
- Reads config CSV into ExperimentConfig
|
|
553
|
+
- Computes canonical paths for all downstream AnnData stages
|
|
554
|
+
- Registers those in the summary CSV
|
|
555
|
+
- Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
|
|
556
|
+
- If needed, calls the core pipeline to actually build the raw AnnData
|
|
557
|
+
|
|
558
|
+
Returns
|
|
559
|
+
-------
|
|
560
|
+
adata : anndata.AnnData | None
|
|
561
|
+
Newly created AnnData object, or None if we skipped because a later-stage
|
|
562
|
+
AnnData already exists.
|
|
563
|
+
adata_path : pathlib.Path
|
|
564
|
+
Path to the "current" AnnData that should be used downstream.
|
|
565
|
+
cfg : ExperimentConfig
|
|
566
|
+
Config object for downstream steps.
|
|
567
|
+
"""
|
|
568
|
+
from importlib import resources
|
|
569
|
+
from datetime import datetime
|
|
570
|
+
from pathlib import Path
|
|
571
|
+
|
|
572
|
+
import pandas as pd # used for summary file reading downstream if needed
|
|
573
|
+
|
|
574
|
+
from ..readwrite import make_dirs, add_or_update_column_in_csv
|
|
575
|
+
from ..config import LoadExperimentConfig, ExperimentConfig
|
|
576
|
+
|
|
577
|
+
from .helpers import get_adata_paths
|
|
578
|
+
|
|
579
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
580
|
+
|
|
581
|
+
# -----------------------------
|
|
582
|
+
# 1) Load config into cfg
|
|
583
|
+
# -----------------------------
|
|
584
|
+
loader = LoadExperimentConfig(config_path)
|
|
585
|
+
defaults_dir = resources.files("smftools").joinpath("config")
|
|
586
|
+
cfg, report = ExperimentConfig.from_var_dict(
|
|
587
|
+
loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
# Ensure base output dir
|
|
591
|
+
make_dirs([cfg.output_directory])
|
|
592
|
+
|
|
593
|
+
# -----------------------------
|
|
594
|
+
# 2) Compute and register paths
|
|
595
|
+
# -----------------------------
|
|
596
|
+
paths = get_adata_paths(cfg)
|
|
597
|
+
|
|
598
|
+
# experiment-level metadata in summary CSV
|
|
599
|
+
add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
|
|
600
|
+
add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
|
|
601
|
+
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
602
|
+
add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
|
|
603
|
+
|
|
604
|
+
# AnnData stage paths
|
|
605
|
+
add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
|
|
606
|
+
add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
|
|
607
|
+
add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
|
|
608
|
+
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
|
|
609
|
+
add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
|
|
610
|
+
|
|
611
|
+
# -----------------------------
|
|
612
|
+
# 3) Stage skipping logic
|
|
613
|
+
# -----------------------------
|
|
614
|
+
if not getattr(cfg, "force_redo_load_adata", False):
|
|
615
|
+
if paths.hmm.exists():
|
|
616
|
+
print(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
|
|
617
|
+
return None, paths.hmm, cfg
|
|
618
|
+
if paths.spatial.exists():
|
|
619
|
+
print(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
|
|
620
|
+
return None, paths.spatial, cfg
|
|
621
|
+
if paths.pp_dedup.exists():
|
|
622
|
+
print(
|
|
623
|
+
f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
|
|
624
|
+
f"Skipping smftools load"
|
|
625
|
+
)
|
|
626
|
+
return None, paths.pp_dedup, cfg
|
|
627
|
+
if paths.pp.exists():
|
|
628
|
+
print(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
|
|
629
|
+
return None, paths.pp, cfg
|
|
630
|
+
if paths.raw.exists():
|
|
631
|
+
print(f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load")
|
|
632
|
+
return None, paths.raw, cfg
|
|
633
|
+
|
|
634
|
+
# If we get here, we actually want to run the full load pipeline
|
|
635
|
+
adata, adata_path, cfg = load_adata_core(cfg, paths)
|
|
636
|
+
|
|
637
|
+
return adata, adata_path, cfg
|