smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/cli/load_adata.py
CHANGED
|
@@ -1,13 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import shutil
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import
|
|
5
|
+
from typing import Iterable, Union
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from smftools.logging_utils import get_logger
|
|
4
10
|
|
|
5
11
|
from .helpers import AdataPaths
|
|
6
12
|
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
7
16
|
def check_executable_exists(cmd: str) -> bool:
|
|
8
17
|
"""Return True if a command-line executable is available in PATH."""
|
|
9
18
|
return shutil.which(cmd) is not None
|
|
10
19
|
|
|
20
|
+
|
|
11
21
|
def delete_tsvs(
|
|
12
22
|
tsv_dir: Union[str, Path, Iterable[str], None],
|
|
13
23
|
*,
|
|
@@ -27,48 +37,140 @@ def delete_tsvs(
|
|
|
27
37
|
verbose : bool
|
|
28
38
|
Print progress / warnings.
|
|
29
39
|
"""
|
|
40
|
+
|
|
30
41
|
# Helper: remove a single file path (Path-like or string)
|
|
31
42
|
def _maybe_unlink(p: Path):
|
|
32
43
|
if not p.exists():
|
|
33
44
|
if verbose:
|
|
34
|
-
|
|
45
|
+
logger.info(f"[skip] not found: {p}")
|
|
35
46
|
return
|
|
36
47
|
if not p.is_file():
|
|
37
48
|
if verbose:
|
|
38
|
-
|
|
49
|
+
logger.info(f"[skip] not a file: {p}")
|
|
39
50
|
return
|
|
40
51
|
if dry_run:
|
|
41
|
-
|
|
52
|
+
logger.info(f"[dry-run] would remove file: {p}")
|
|
42
53
|
return
|
|
43
54
|
try:
|
|
44
55
|
p.unlink()
|
|
45
56
|
if verbose:
|
|
46
|
-
|
|
57
|
+
logger.info(f"Removed file: {p}")
|
|
47
58
|
except Exception as e:
|
|
48
|
-
|
|
59
|
+
logger.warning(f"Failed to remove file {p}: {e}")
|
|
49
60
|
|
|
50
61
|
# Remove tmp_dir recursively (if provided)
|
|
51
62
|
if tsv_dir is not None:
|
|
52
63
|
td = Path(tsv_dir)
|
|
53
64
|
if not td.exists():
|
|
54
65
|
if verbose:
|
|
55
|
-
|
|
66
|
+
logger.info(f"[skip] tsv_dir not found: {td}")
|
|
56
67
|
else:
|
|
57
68
|
if not td.is_dir():
|
|
58
69
|
if verbose:
|
|
59
|
-
|
|
70
|
+
logger.info(f"[skip] tsv_dir is not a directory: {td}")
|
|
60
71
|
else:
|
|
61
72
|
if dry_run:
|
|
62
|
-
|
|
73
|
+
logger.info(f"[dry-run] would remove directory tree: {td}")
|
|
63
74
|
else:
|
|
64
75
|
try:
|
|
65
76
|
shutil.rmtree(td)
|
|
66
77
|
if verbose:
|
|
67
|
-
|
|
78
|
+
logger.info(f"Removed directory tree: {td}")
|
|
68
79
|
except Exception as e:
|
|
69
|
-
|
|
80
|
+
logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_adata(config_path: str):
|
|
84
|
+
"""
|
|
85
|
+
CLI-facing wrapper for the load pipeline.
|
|
86
|
+
|
|
87
|
+
- Reads config CSV into ExperimentConfig
|
|
88
|
+
- Computes canonical paths for all downstream AnnData stages
|
|
89
|
+
- Registers those in the summary CSV
|
|
90
|
+
- Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
|
|
91
|
+
- If needed, calls the core pipeline to actually build the raw AnnData
|
|
70
92
|
|
|
71
|
-
|
|
93
|
+
Returns
|
|
94
|
+
-------
|
|
95
|
+
adata : anndata.AnnData | None
|
|
96
|
+
Newly created AnnData object, or None if we skipped because a later-stage
|
|
97
|
+
AnnData already exists.
|
|
98
|
+
adata_path : pathlib.Path
|
|
99
|
+
Path to the "current" AnnData that should be used downstream.
|
|
100
|
+
cfg : ExperimentConfig
|
|
101
|
+
Config object for downstream steps.
|
|
102
|
+
"""
|
|
103
|
+
from datetime import datetime
|
|
104
|
+
from importlib import resources
|
|
105
|
+
|
|
106
|
+
from ..config import ExperimentConfig, LoadExperimentConfig
|
|
107
|
+
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
108
|
+
from .helpers import get_adata_paths
|
|
109
|
+
|
|
110
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
111
|
+
|
|
112
|
+
# -----------------------------
|
|
113
|
+
# 1) Load config into cfg
|
|
114
|
+
# -----------------------------
|
|
115
|
+
loader = LoadExperimentConfig(config_path)
|
|
116
|
+
defaults_dir = resources.files("smftools").joinpath("config")
|
|
117
|
+
cfg, report = ExperimentConfig.from_var_dict(
|
|
118
|
+
loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
# Ensure base output dir
|
|
122
|
+
make_dirs([cfg.output_directory])
|
|
123
|
+
|
|
124
|
+
# -----------------------------
|
|
125
|
+
# 2) Compute and register paths
|
|
126
|
+
# -----------------------------
|
|
127
|
+
paths = get_adata_paths(cfg)
|
|
128
|
+
|
|
129
|
+
# experiment-level metadata in summary CSV
|
|
130
|
+
add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
|
|
131
|
+
add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
|
|
132
|
+
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
133
|
+
add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
|
|
134
|
+
|
|
135
|
+
# AnnData stage paths
|
|
136
|
+
add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
|
|
137
|
+
add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
|
|
138
|
+
add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
|
|
139
|
+
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
|
|
140
|
+
add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
|
|
141
|
+
|
|
142
|
+
# -----------------------------
|
|
143
|
+
# 3) Stage skipping logic
|
|
144
|
+
# -----------------------------
|
|
145
|
+
if not getattr(cfg, "force_redo_load_adata", False):
|
|
146
|
+
if paths.hmm.exists():
|
|
147
|
+
logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
|
|
148
|
+
return None, paths.hmm, cfg
|
|
149
|
+
if paths.spatial.exists():
|
|
150
|
+
logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
|
|
151
|
+
return None, paths.spatial, cfg
|
|
152
|
+
if paths.pp_dedup.exists():
|
|
153
|
+
logger.debug(
|
|
154
|
+
f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
|
|
155
|
+
f"Skipping smftools load"
|
|
156
|
+
)
|
|
157
|
+
return None, paths.pp_dedup, cfg
|
|
158
|
+
if paths.pp.exists():
|
|
159
|
+
logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
|
|
160
|
+
return None, paths.pp, cfg
|
|
161
|
+
if paths.raw.exists():
|
|
162
|
+
logger.debug(
|
|
163
|
+
f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
|
|
164
|
+
)
|
|
165
|
+
return None, paths.raw, cfg
|
|
166
|
+
|
|
167
|
+
# If we get here, we actually want to run the full load pipeline
|
|
168
|
+
adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
|
|
169
|
+
|
|
170
|
+
return adata, adata_path, cfg
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
72
174
|
"""
|
|
73
175
|
Core load pipeline.
|
|
74
176
|
|
|
@@ -97,28 +199,31 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
97
199
|
cfg : ExperimentConfig
|
|
98
200
|
(Same object, possibly with some fields updated, e.g. fasta path.)
|
|
99
201
|
"""
|
|
100
|
-
import os
|
|
101
|
-
from pathlib import Path
|
|
102
|
-
|
|
103
|
-
import numpy as np
|
|
104
|
-
import pandas as pd
|
|
105
|
-
import anndata as ad
|
|
106
|
-
import scanpy as sc
|
|
107
202
|
|
|
108
|
-
from .
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
203
|
+
from ..informatics.bam_functions import (
|
|
204
|
+
align_and_sort_BAM,
|
|
205
|
+
bam_qc,
|
|
206
|
+
concatenate_fastqs_to_bam,
|
|
207
|
+
demux_and_index_BAM,
|
|
208
|
+
extract_read_features_from_bam,
|
|
209
|
+
split_and_index_BAM,
|
|
210
|
+
)
|
|
211
|
+
from ..informatics.basecalling import canoncall, modcall
|
|
113
212
|
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
114
|
-
from ..informatics.pod5_functions import fast5_to_pod5
|
|
115
|
-
from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
|
|
116
|
-
from ..informatics.basecalling import modcall, canoncall
|
|
117
|
-
from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
|
|
118
|
-
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
119
213
|
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
214
|
+
from ..informatics.fasta_functions import (
|
|
215
|
+
generate_converted_FASTA,
|
|
216
|
+
get_chromosome_lengths,
|
|
217
|
+
subsample_fasta_from_bed,
|
|
218
|
+
)
|
|
120
219
|
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
220
|
+
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
221
|
+
from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
|
|
222
|
+
from ..informatics.pod5_functions import fast5_to_pod5
|
|
121
223
|
from ..informatics.run_multiqc import run_multiqc
|
|
224
|
+
from ..metadata import record_smftools_metadata
|
|
225
|
+
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
226
|
+
from .helpers import write_gz_h5ad
|
|
122
227
|
|
|
123
228
|
################################### 1) General params and input organization ###################################
|
|
124
229
|
output_directory = Path(cfg.output_directory)
|
|
@@ -169,19 +274,20 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
169
274
|
if cfg.aligner == "minimap2":
|
|
170
275
|
if not check_executable_exists("minimap2"):
|
|
171
276
|
raise RuntimeError(
|
|
172
|
-
"Error: 'minimap2' is not installed or not in PATH. "
|
|
173
|
-
"Install minimap2"
|
|
277
|
+
"Error: 'minimap2' is not installed or not in PATH. Install minimap2"
|
|
174
278
|
)
|
|
175
279
|
|
|
176
280
|
# # Detect the input filetypes
|
|
177
281
|
# If the input files are fast5 files, convert the files to a pod5 file before proceeding.
|
|
178
282
|
if cfg.input_type == "fast5":
|
|
179
283
|
# take the input directory of fast5 files and write out a single pod5 file into the output directory.
|
|
180
|
-
output_pod5 = cfg.output_directory /
|
|
284
|
+
output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
|
|
181
285
|
if output_pod5.exists():
|
|
182
286
|
pass
|
|
183
287
|
else:
|
|
184
|
-
|
|
288
|
+
logger.info(
|
|
289
|
+
f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
|
|
290
|
+
)
|
|
185
291
|
fast5_to_pod5(cfg.input_data_path, output_pod5)
|
|
186
292
|
# Reassign the pod5_dir variable to point to the new pod5 file.
|
|
187
293
|
cfg.input_data_path = output_pod5
|
|
@@ -189,22 +295,25 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
189
295
|
# If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
|
|
190
296
|
elif cfg.input_type == "fastq":
|
|
191
297
|
# Output file for FASTQ concatenation.
|
|
192
|
-
output_bam = cfg.output_directory /
|
|
298
|
+
output_bam = cfg.output_directory / "canonical_basecalls.bam"
|
|
193
299
|
if output_bam.exists():
|
|
194
|
-
|
|
300
|
+
logger.debug("Output BAM already exists")
|
|
195
301
|
else:
|
|
302
|
+
logger.info("Concatenating FASTQ files into a single BAM file")
|
|
196
303
|
summary = concatenate_fastqs_to_bam(
|
|
197
304
|
cfg.input_files,
|
|
198
305
|
output_bam,
|
|
199
|
-
barcode_tag=
|
|
200
|
-
gzip_suffixes=(
|
|
306
|
+
barcode_tag="BC",
|
|
307
|
+
gzip_suffixes=(".gz", ".gzip"),
|
|
201
308
|
barcode_map=cfg.fastq_barcode_map,
|
|
202
309
|
add_read_group=True,
|
|
203
310
|
rg_sample_field=None,
|
|
204
311
|
progress=False,
|
|
205
|
-
auto_pair=cfg.fastq_auto_pairing
|
|
206
|
-
|
|
207
|
-
|
|
312
|
+
auto_pair=cfg.fastq_auto_pairing,
|
|
313
|
+
samtools_backend=cfg.samtools_backend,
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
|
|
208
317
|
|
|
209
318
|
# Set the input data path to the concatenated BAM.
|
|
210
319
|
cfg.input_data_path = output_bam
|
|
@@ -213,24 +322,24 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
213
322
|
pass
|
|
214
323
|
else:
|
|
215
324
|
pass
|
|
216
|
-
|
|
325
|
+
|
|
217
326
|
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
218
327
|
|
|
219
328
|
# Determine if the input data needs to be basecalled
|
|
220
329
|
if cfg.input_type == "pod5":
|
|
221
|
-
|
|
330
|
+
logger.info(f"Detected pod5 inputs: {cfg.input_files}")
|
|
222
331
|
basecall = True
|
|
223
332
|
elif cfg.input_type in ["bam"]:
|
|
224
|
-
|
|
333
|
+
logger.info(f"Detected bam input: {cfg.input_files}")
|
|
225
334
|
basecall = False
|
|
226
335
|
else:
|
|
227
|
-
|
|
336
|
+
logger.info("Error, can not find input bam or pod5")
|
|
228
337
|
|
|
229
338
|
# Generate the base name of the unaligned bam without the .bam suffix
|
|
230
339
|
if basecall:
|
|
231
340
|
model_basename = Path(cfg.model).name
|
|
232
|
-
model_basename = str(model_basename).replace(
|
|
233
|
-
if cfg.smf_modality ==
|
|
341
|
+
model_basename = str(model_basename).replace(".", "_")
|
|
342
|
+
if cfg.smf_modality == "direct":
|
|
234
343
|
mod_string = "_".join(cfg.mod_list)
|
|
235
344
|
bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
|
|
236
345
|
else:
|
|
@@ -241,7 +350,9 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
241
350
|
|
|
242
351
|
# Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
|
|
243
352
|
unaligned_output = bam.with_suffix(cfg.bam_suffix)
|
|
244
|
-
aligned_BAM =
|
|
353
|
+
aligned_BAM = (
|
|
354
|
+
cfg.output_directory / (bam.stem + "_aligned")
|
|
355
|
+
) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
|
|
245
356
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
246
357
|
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
247
358
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
@@ -252,34 +363,40 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
252
363
|
########################################################################################################################
|
|
253
364
|
|
|
254
365
|
################################### 2) FASTA Handling ###################################
|
|
255
|
-
from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
|
|
256
366
|
|
|
257
367
|
try:
|
|
258
368
|
cfg.fasta = Path(cfg.fasta)
|
|
259
|
-
except:
|
|
260
|
-
|
|
369
|
+
except Exception:
|
|
370
|
+
logger.warning("Need to provide an input FASTA path to proceed with smftools load")
|
|
261
371
|
|
|
262
372
|
# If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
|
|
263
|
-
if cfg.fasta_regions_of_interest and
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
output_FASTA =
|
|
267
|
-
|
|
268
|
-
|
|
373
|
+
if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
|
|
374
|
+
fasta_stem = cfg.fasta.stem
|
|
375
|
+
bed_stem = Path(cfg.fasta_regions_of_interest).stem
|
|
376
|
+
output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
|
|
377
|
+
|
|
378
|
+
logger.info("Subsampling FASTA records using the provided BED file")
|
|
379
|
+
subsample_fasta_from_bed(
|
|
380
|
+
cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
|
|
381
|
+
)
|
|
382
|
+
fasta = output_FASTA
|
|
269
383
|
else:
|
|
384
|
+
logger.info("Using the full FASTA file")
|
|
270
385
|
fasta = cfg.fasta
|
|
271
386
|
|
|
272
387
|
# For conversion style SMF, make a converted reference FASTA
|
|
273
|
-
if cfg.smf_modality ==
|
|
274
|
-
|
|
275
|
-
converted_FASTA_basename =
|
|
388
|
+
if cfg.smf_modality == "conversion":
|
|
389
|
+
fasta_stem = fasta.stem
|
|
390
|
+
converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
|
|
276
391
|
converted_FASTA = cfg.output_directory / converted_FASTA_basename
|
|
277
|
-
|
|
278
|
-
|
|
392
|
+
|
|
393
|
+
if "converted.fa" in fasta.name:
|
|
394
|
+
logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
|
|
279
395
|
converted_FASTA = fasta
|
|
280
396
|
elif converted_FASTA.exists():
|
|
281
|
-
|
|
397
|
+
logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
|
|
282
398
|
else:
|
|
399
|
+
logger.info(f"Converting FASTA base sequences")
|
|
283
400
|
generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
|
|
284
401
|
fasta = converted_FASTA
|
|
285
402
|
|
|
@@ -290,121 +407,176 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
290
407
|
########################################################################################################################
|
|
291
408
|
|
|
292
409
|
################################### 3) Basecalling ###################################
|
|
293
|
-
|
|
410
|
+
|
|
294
411
|
# 1) Basecall using dorado
|
|
295
|
-
if basecall and cfg.sequencer ==
|
|
412
|
+
if basecall and cfg.sequencer == "ont":
|
|
296
413
|
try:
|
|
297
414
|
cfg.model_dir = Path(cfg.model_dir)
|
|
298
|
-
except:
|
|
299
|
-
|
|
415
|
+
except Exception:
|
|
416
|
+
logger.warning(
|
|
417
|
+
"Need to provide a valid path to a dorado model directory to use dorado basecalling"
|
|
418
|
+
)
|
|
300
419
|
if aligned_sorted_output.exists():
|
|
301
|
-
|
|
420
|
+
logger.info(
|
|
421
|
+
f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
|
|
422
|
+
)
|
|
302
423
|
elif unaligned_output.exists():
|
|
303
|
-
|
|
304
|
-
elif cfg.smf_modality !=
|
|
305
|
-
|
|
424
|
+
logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
|
|
425
|
+
elif cfg.smf_modality != "direct":
|
|
426
|
+
logger.info("Running canonical basecalling using dorado")
|
|
427
|
+
canoncall(
|
|
428
|
+
str(cfg.model_dir),
|
|
429
|
+
cfg.model,
|
|
430
|
+
str(cfg.input_data_path),
|
|
431
|
+
cfg.barcode_kit,
|
|
432
|
+
str(bam),
|
|
433
|
+
cfg.bam_suffix,
|
|
434
|
+
cfg.barcode_both_ends,
|
|
435
|
+
cfg.trim,
|
|
436
|
+
cfg.device,
|
|
437
|
+
)
|
|
306
438
|
else:
|
|
307
|
-
|
|
439
|
+
logger.info("Running modified basecalling using dorado")
|
|
440
|
+
modcall(
|
|
441
|
+
str(cfg.model_dir),
|
|
442
|
+
cfg.model,
|
|
443
|
+
str(cfg.input_data_path),
|
|
444
|
+
cfg.barcode_kit,
|
|
445
|
+
cfg.mod_list,
|
|
446
|
+
str(bam),
|
|
447
|
+
cfg.bam_suffix,
|
|
448
|
+
cfg.barcode_both_ends,
|
|
449
|
+
cfg.trim,
|
|
450
|
+
cfg.device,
|
|
451
|
+
)
|
|
308
452
|
elif basecall:
|
|
309
|
-
|
|
453
|
+
logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
|
|
310
454
|
else:
|
|
311
455
|
pass
|
|
312
456
|
########################################################################################################################
|
|
313
457
|
|
|
314
458
|
################################### 4) Alignment and sorting #############################################
|
|
315
|
-
|
|
316
|
-
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
459
|
+
|
|
317
460
|
# 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
|
|
318
461
|
if aligned_sorted_output.exists():
|
|
319
|
-
|
|
462
|
+
logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
|
|
320
463
|
else:
|
|
464
|
+
logger.info(f"Aligning and sorting reads")
|
|
321
465
|
align_and_sort_BAM(fasta, unaligned_output, cfg)
|
|
322
466
|
# Deleted the unsorted aligned output
|
|
323
467
|
aligned_output.unlink()
|
|
324
468
|
|
|
325
469
|
if cfg.make_beds:
|
|
326
470
|
# Make beds and provide basic histograms
|
|
327
|
-
bed_dir = cfg.output_directory /
|
|
471
|
+
bed_dir = cfg.output_directory / "beds"
|
|
328
472
|
if bed_dir.is_dir():
|
|
329
|
-
|
|
473
|
+
logger.debug(
|
|
474
|
+
f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
|
|
475
|
+
)
|
|
330
476
|
else:
|
|
331
|
-
|
|
477
|
+
logger.info("Making bed files from the aligned and sorted BAM file")
|
|
478
|
+
aligned_BAM_to_bed(
|
|
479
|
+
aligned_sorted_output,
|
|
480
|
+
cfg.output_directory,
|
|
481
|
+
fasta,
|
|
482
|
+
cfg.make_bigwigs,
|
|
483
|
+
cfg.threads,
|
|
484
|
+
samtools_backend=cfg.samtools_backend,
|
|
485
|
+
bedtools_backend=cfg.bedtools_backend,
|
|
486
|
+
bigwig_backend=cfg.bigwig_backend,
|
|
487
|
+
)
|
|
332
488
|
########################################################################################################################
|
|
333
489
|
|
|
334
490
|
################################### 5) Demultiplexing ######################################################################
|
|
335
|
-
|
|
491
|
+
|
|
336
492
|
# 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
|
|
337
493
|
if cfg.input_already_demuxed:
|
|
338
494
|
if cfg.split_path.is_dir():
|
|
339
|
-
|
|
495
|
+
logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
|
|
340
496
|
|
|
341
497
|
all_bam_files = sorted(
|
|
342
|
-
p for p in cfg.split_path.iterdir()
|
|
343
|
-
if p.is_file()
|
|
344
|
-
and p.suffix == cfg.bam_suffix
|
|
498
|
+
p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
|
|
345
499
|
)
|
|
346
500
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
347
501
|
bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
|
|
348
502
|
|
|
349
503
|
else:
|
|
350
504
|
make_dirs([cfg.split_path])
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
505
|
+
logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
|
|
506
|
+
all_bam_files = split_and_index_BAM(
|
|
507
|
+
aligned_sorted_BAM,
|
|
508
|
+
cfg.split_path,
|
|
509
|
+
cfg.bam_suffix,
|
|
510
|
+
samtools_backend=cfg.samtools_backend,
|
|
511
|
+
)
|
|
512
|
+
|
|
355
513
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
356
514
|
bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
|
|
357
515
|
|
|
358
516
|
se_bam_files = bam_files
|
|
359
517
|
bam_dir = cfg.split_path
|
|
360
|
-
|
|
518
|
+
|
|
361
519
|
else:
|
|
362
520
|
if single_barcoded_path.is_dir():
|
|
363
|
-
|
|
521
|
+
logger.debug(
|
|
522
|
+
f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
|
|
523
|
+
)
|
|
364
524
|
|
|
365
525
|
all_se_bam_files = sorted(
|
|
366
|
-
p
|
|
367
|
-
|
|
368
|
-
and p.suffix == cfg.bam_suffix
|
|
369
|
-
)
|
|
526
|
+
p
|
|
527
|
+
for p in single_barcoded_path.iterdir()
|
|
528
|
+
if p.is_file() and p.suffix == cfg.bam_suffix
|
|
529
|
+
)
|
|
370
530
|
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
371
531
|
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
372
532
|
else:
|
|
373
|
-
make_dirs([cfg.split_path, single_barcoded_path])
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
533
|
+
make_dirs([cfg.split_path, single_barcoded_path])
|
|
534
|
+
logger.info(
|
|
535
|
+
"Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
|
|
536
|
+
)
|
|
537
|
+
all_se_bam_files = demux_and_index_BAM(
|
|
538
|
+
aligned_sorted_BAM,
|
|
539
|
+
single_barcoded_path,
|
|
540
|
+
cfg.bam_suffix,
|
|
541
|
+
cfg.barcode_kit,
|
|
542
|
+
False,
|
|
543
|
+
cfg.trim,
|
|
544
|
+
cfg.threads,
|
|
545
|
+
)
|
|
546
|
+
|
|
382
547
|
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
383
548
|
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
384
|
-
|
|
549
|
+
|
|
385
550
|
if double_barcoded_path.is_dir():
|
|
386
|
-
|
|
551
|
+
logger.debug(
|
|
552
|
+
f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
|
|
553
|
+
)
|
|
387
554
|
|
|
388
555
|
all_de_bam_files = sorted(
|
|
389
|
-
p
|
|
390
|
-
|
|
391
|
-
and p.suffix == cfg.bam_suffix
|
|
392
|
-
)
|
|
556
|
+
p
|
|
557
|
+
for p in double_barcoded_path.iterdir()
|
|
558
|
+
if p.is_file() and p.suffix == cfg.bam_suffix
|
|
559
|
+
)
|
|
393
560
|
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
394
561
|
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
395
|
-
else:
|
|
396
|
-
make_dirs([cfg.split_path, double_barcoded_path])
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
562
|
+
else:
|
|
563
|
+
make_dirs([cfg.split_path, double_barcoded_path])
|
|
564
|
+
logger.info(
|
|
565
|
+
"Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
|
|
566
|
+
)
|
|
567
|
+
all_de_bam_files = demux_and_index_BAM(
|
|
568
|
+
aligned_sorted_BAM,
|
|
569
|
+
double_barcoded_path,
|
|
570
|
+
cfg.bam_suffix,
|
|
571
|
+
cfg.barcode_kit,
|
|
572
|
+
True,
|
|
573
|
+
cfg.trim,
|
|
574
|
+
cfg.threads,
|
|
575
|
+
)
|
|
576
|
+
|
|
405
577
|
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
406
578
|
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
407
|
-
|
|
579
|
+
|
|
408
580
|
bam_files = se_bam_files + de_bam_files
|
|
409
581
|
unclassified_bams = unclassified_se_bams + unclassified_de_bams
|
|
410
582
|
bam_dir = single_barcoded_path
|
|
@@ -413,225 +585,205 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
413
585
|
|
|
414
586
|
if cfg.make_beds:
|
|
415
587
|
# Make beds and provide basic histograms
|
|
416
|
-
bed_dir = cfg.split_path /
|
|
588
|
+
bed_dir = cfg.split_path / "beds"
|
|
417
589
|
if bed_dir.is_dir():
|
|
418
|
-
|
|
590
|
+
logger.debug(
|
|
591
|
+
f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
|
|
592
|
+
)
|
|
419
593
|
else:
|
|
594
|
+
logger.info("Making BED files from BAM files for each sample")
|
|
420
595
|
for bam in bam_files:
|
|
421
|
-
aligned_BAM_to_bed(
|
|
596
|
+
aligned_BAM_to_bed(
|
|
597
|
+
bam,
|
|
598
|
+
cfg.split_path,
|
|
599
|
+
fasta,
|
|
600
|
+
cfg.make_bigwigs,
|
|
601
|
+
cfg.threads,
|
|
602
|
+
samtools_backend=cfg.samtools_backend,
|
|
603
|
+
bedtools_backend=cfg.bedtools_backend,
|
|
604
|
+
bigwig_backend=cfg.bigwig_backend,
|
|
605
|
+
)
|
|
422
606
|
########################################################################################################################
|
|
423
607
|
|
|
424
608
|
################################### 6) SAMTools based BAM QC ######################################################################
|
|
425
|
-
|
|
609
|
+
|
|
426
610
|
# 5) Samtools QC metrics on split BAM files
|
|
427
611
|
bam_qc_dir = cfg.split_path / "bam_qc"
|
|
428
612
|
if bam_qc_dir.is_dir():
|
|
429
|
-
|
|
613
|
+
logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
|
|
430
614
|
else:
|
|
431
615
|
make_dirs([bam_qc_dir])
|
|
432
|
-
|
|
433
|
-
|
|
616
|
+
logger.info("Performing BAM QC")
|
|
617
|
+
bam_qc(
|
|
618
|
+
bam_files,
|
|
619
|
+
bam_qc_dir,
|
|
620
|
+
cfg.threads,
|
|
621
|
+
modality=cfg.smf_modality,
|
|
622
|
+
samtools_backend=cfg.samtools_backend,
|
|
623
|
+
)
|
|
624
|
+
########################################################################################################################
|
|
434
625
|
|
|
435
626
|
################################### 7) AnnData loading ######################################################################
|
|
436
|
-
if cfg.smf_modality !=
|
|
627
|
+
if cfg.smf_modality != "direct":
|
|
437
628
|
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
629
|
+
|
|
438
630
|
# 6) Take the converted BAM and load it into an adata object.
|
|
439
|
-
if cfg.smf_modality ==
|
|
631
|
+
if cfg.smf_modality == "deaminase":
|
|
440
632
|
deaminase_footprinting = True
|
|
441
633
|
else:
|
|
442
634
|
deaminase_footprinting = False
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
635
|
+
|
|
636
|
+
logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
|
|
637
|
+
raw_adata, raw_adata_path = converted_BAM_to_adata(
|
|
638
|
+
fasta,
|
|
639
|
+
bam_dir,
|
|
640
|
+
cfg.output_directory,
|
|
641
|
+
cfg.input_already_demuxed,
|
|
642
|
+
cfg.mapping_threshold,
|
|
643
|
+
cfg.experiment_name,
|
|
644
|
+
cfg.conversion_types,
|
|
645
|
+
cfg.bam_suffix,
|
|
646
|
+
cfg.device,
|
|
647
|
+
cfg.threads,
|
|
648
|
+
deaminase_footprinting,
|
|
649
|
+
delete_intermediates=cfg.delete_intermediate_hdfs,
|
|
650
|
+
double_barcoded_path=double_barcoded_path,
|
|
651
|
+
samtools_backend=cfg.samtools_backend,
|
|
652
|
+
)
|
|
456
653
|
else:
|
|
457
654
|
if mod_bed_dir.is_dir():
|
|
458
|
-
|
|
655
|
+
logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
|
|
459
656
|
else:
|
|
460
|
-
from ..informatics.modkit_functions import
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
657
|
+
from ..informatics.modkit_functions import make_modbed, modQC
|
|
658
|
+
|
|
659
|
+
make_dirs([mod_bed_dir])
|
|
660
|
+
|
|
661
|
+
logger.info("Performing modQC for direct footprinting samples")
|
|
662
|
+
|
|
663
|
+
modQC(aligned_sorted_output, cfg.thresholds) # get QC metrics for mod calls
|
|
664
|
+
|
|
665
|
+
logger.info("Making modified BED files for direct footprinting samples")
|
|
666
|
+
|
|
667
|
+
make_modbed(
|
|
668
|
+
aligned_sorted_output, cfg.thresholds, mod_bed_dir
|
|
669
|
+
) # Generate bed files of position methylation summaries for every sample
|
|
670
|
+
|
|
470
671
|
from ..informatics.modkit_functions import extract_mods
|
|
672
|
+
|
|
471
673
|
make_dirs([mod_tsv_dir])
|
|
472
674
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
675
|
+
logger.info(
|
|
676
|
+
"Extracting single read modification states into TSVs for direct footprinting samples"
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
extract_mods(
|
|
680
|
+
cfg.thresholds,
|
|
681
|
+
mod_tsv_dir,
|
|
682
|
+
bam_dir,
|
|
683
|
+
cfg.bam_suffix,
|
|
684
|
+
skip_unclassified=cfg.skip_unclassified,
|
|
685
|
+
modkit_summary=False,
|
|
686
|
+
threads=cfg.threads,
|
|
687
|
+
) # Extract methylations calls for split BAM files into split TSV files
|
|
688
|
+
|
|
481
689
|
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
690
|
+
|
|
691
|
+
logger.info("Making Anndata for direct modification detection SMF samples")
|
|
692
|
+
|
|
693
|
+
# 6 Load the modification data from TSVs into an adata object
|
|
694
|
+
raw_adata, raw_adata_path = modkit_extract_to_adata(
|
|
695
|
+
fasta,
|
|
696
|
+
bam_dir,
|
|
697
|
+
cfg.output_directory,
|
|
698
|
+
cfg.input_already_demuxed,
|
|
699
|
+
cfg.mapping_threshold,
|
|
700
|
+
cfg.experiment_name,
|
|
701
|
+
mods,
|
|
702
|
+
cfg.batch_size,
|
|
703
|
+
mod_tsv_dir,
|
|
704
|
+
cfg.delete_batch_hdfs,
|
|
705
|
+
cfg.threads,
|
|
706
|
+
double_barcoded_path,
|
|
707
|
+
cfg.samtools_backend,
|
|
708
|
+
)
|
|
495
709
|
if cfg.delete_intermediate_tsvs:
|
|
496
710
|
delete_tsvs(mod_tsv_dir)
|
|
497
711
|
|
|
498
|
-
raw_adata.obs[
|
|
499
|
-
raw_adata.obs[
|
|
712
|
+
raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
|
|
713
|
+
raw_adata.obs["Experiment_name_and_barcode"] = (
|
|
714
|
+
raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
|
|
715
|
+
)
|
|
500
716
|
|
|
501
717
|
########################################################################################################################
|
|
502
718
|
|
|
503
719
|
############################################### Add basic read length, read quality, mapping quality stats ###############################################
|
|
504
|
-
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
505
|
-
from ..informatics.bam_functions import extract_read_features_from_bam
|
|
506
|
-
add_read_length_and_mapping_qc(raw_adata, se_bam_files,
|
|
507
|
-
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
508
|
-
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
509
|
-
force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
|
|
510
720
|
|
|
511
|
-
|
|
721
|
+
logger.info("Adding read length, mapping quality, and modification signal to Anndata")
|
|
722
|
+
add_read_length_and_mapping_qc(
|
|
723
|
+
raw_adata,
|
|
724
|
+
se_bam_files,
|
|
725
|
+
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
726
|
+
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
727
|
+
force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
|
|
728
|
+
samtools_backend=cfg.samtools_backend,
|
|
729
|
+
)
|
|
730
|
+
|
|
731
|
+
raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
|
|
732
|
+
########################################################################################################################
|
|
733
|
+
|
|
734
|
+
############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
|
|
735
|
+
from ..informatics.h5ad_functions import annotate_pod5_origin
|
|
736
|
+
|
|
737
|
+
if cfg.input_type == "pod5":
|
|
738
|
+
logger.info("Adding the POD5 origin file to each read into Anndata")
|
|
739
|
+
annotate_pod5_origin(
|
|
740
|
+
raw_adata,
|
|
741
|
+
cfg.input_data_path,
|
|
742
|
+
n_jobs=cfg.threads,
|
|
743
|
+
csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
|
|
744
|
+
)
|
|
512
745
|
########################################################################################################################
|
|
513
746
|
|
|
514
747
|
############################################### Save final adata ###############################################
|
|
515
|
-
|
|
748
|
+
logger.info(f"Saving AnnData to {raw_adata_path}")
|
|
749
|
+
record_smftools_metadata(
|
|
750
|
+
raw_adata,
|
|
751
|
+
step_name="load",
|
|
752
|
+
cfg=cfg,
|
|
753
|
+
config_path=config_path,
|
|
754
|
+
output_path=raw_adata_path,
|
|
755
|
+
)
|
|
516
756
|
write_gz_h5ad(raw_adata, raw_adata_path)
|
|
517
757
|
########################################################################################################################
|
|
518
758
|
|
|
519
759
|
############################################### MultiQC HTML Report ###############################################
|
|
520
|
-
|
|
760
|
+
|
|
521
761
|
# multiqc ###
|
|
522
762
|
mqc_dir = cfg.split_path / "multiqc"
|
|
523
763
|
if mqc_dir.is_dir():
|
|
524
|
-
|
|
764
|
+
logger.info(f"{mqc_dir} already exists, skipping multiqc")
|
|
525
765
|
else:
|
|
766
|
+
logger.info("Running multiqc")
|
|
526
767
|
run_multiqc(cfg.split_path, mqc_dir)
|
|
527
768
|
########################################################################################################################
|
|
528
769
|
|
|
529
770
|
############################################### delete intermediate BAM files ###############################################
|
|
530
771
|
if cfg.delete_intermediate_bams:
|
|
772
|
+
logger.info("Deleting intermediate BAM files")
|
|
531
773
|
# delete aligned and sorted bam
|
|
532
774
|
aligned_sorted_output.unlink()
|
|
533
|
-
bai = aligned_sorted_output.parent / (aligned_sorted_output.name +
|
|
775
|
+
bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
|
|
534
776
|
bai.unlink()
|
|
535
777
|
# delete the demultiplexed bams. Keep the demultiplexing summary files and directories to facilitate demultiplexing in the future with these files
|
|
536
778
|
for bam in bam_files:
|
|
537
|
-
bai = bam.parent / (bam.name +
|
|
779
|
+
bai = bam.parent / (bam.name + ".bai")
|
|
538
780
|
bam.unlink()
|
|
539
781
|
bai.unlink()
|
|
540
782
|
for bam in unclassified_bams:
|
|
541
|
-
bai = bam.parent / (bam.name +
|
|
783
|
+
bai = bam.parent / (bam.name + ".bai")
|
|
542
784
|
bam.unlink()
|
|
543
|
-
bai.unlink()
|
|
785
|
+
bai.unlink()
|
|
786
|
+
logger.info("Finished deleting intermediate BAM files")
|
|
544
787
|
########################################################################################################################
|
|
545
788
|
|
|
546
789
|
return raw_adata, raw_adata_path, cfg
|
|
547
|
-
|
|
548
|
-
def load_adata(config_path: str):
    """
    CLI-facing wrapper for the load pipeline.

    - Reads config CSV into ExperimentConfig
    - Computes canonical paths for all downstream AnnData stages
    - Registers those in the summary CSV
    - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
    - If needed, calls the core pipeline to actually build the raw AnnData

    Parameters
    ----------
    config_path : str
        Path to the experiment config; handed to ``LoadExperimentConfig`` and
        recorded verbatim in the summary CSV.

    Returns
    -------
    adata : anndata.AnnData | None
        Newly created AnnData object, or None if we skipped because a later-stage
        AnnData already exists.
    adata_path : pathlib.Path
        Path to the "current" AnnData that should be used downstream.
    cfg : ExperimentConfig
        Config object for downstream steps.
    """
    from datetime import datetime
    from importlib import resources

    from ..config import ExperimentConfig, LoadExperimentConfig
    from ..readwrite import add_or_update_column_in_csv, make_dirs
    from .helpers import get_adata_paths

    date_str = datetime.today().strftime("%y%m%d")

    # -----------------------------
    # 1) Load config into cfg
    # -----------------------------
    loader = LoadExperimentConfig(config_path)
    defaults_dir = resources.files("smftools").joinpath("config")
    # The second element of the returned pair is not used by this wrapper.
    cfg, _report = ExperimentConfig.from_var_dict(
        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
    )

    # Ensure base output dir
    make_dirs([cfg.output_directory])

    # -----------------------------
    # 2) Compute and register paths
    # -----------------------------
    paths = get_adata_paths(cfg)

    # Columns are written one at a time, in this order: experiment-level
    # metadata first, then the AnnData path for every pipeline stage.
    summary_columns = (
        ("experiment_name", cfg.experiment_name),
        ("config_path", config_path),
        ("input_data_path", cfg.input_data_path),
        ("input_files", [cfg.input_files]),
        ("load_adata", paths.raw),
        ("pp_adata", paths.pp),
        ("pp_dedup_adata", paths.pp_dedup),
        ("spatial_adata", paths.spatial),
        ("hmm_adata", paths.hmm),
    )
    for column, value in summary_columns:
        add_or_update_column_in_csv(cfg.summary_file, column, value)

    # -----------------------------
    # 3) Stage skipping logic
    # -----------------------------
    if not getattr(cfg, "force_redo_load_adata", False):
        # Ordered from the latest pipeline stage to the earliest: the first
        # AnnData found on disk is reported as the one to use downstream.
        existing_stages = (
            ("HMM AnnData", paths.hmm),
            ("Spatial AnnData", paths.spatial),
            ("Preprocessed deduplicated AnnData", paths.pp_dedup),
            ("Preprocessed AnnData", paths.pp),
            ("Raw AnnData from smftools load", paths.raw),
        )
        for label, stage_path in existing_stages:
            if stage_path.exists():
                print(f"{label} already exists: {stage_path}\nSkipping smftools load")
                return None, stage_path, cfg

    # If we get here, we actually want to run the full load pipeline
    adata, adata_path, cfg = load_adata_core(cfg, paths)

    return adata, adata_path, cfg
|