smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/cli/load_adata.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
1
4
|
import shutil
|
|
2
5
|
from pathlib import Path
|
|
3
6
|
from typing import Iterable, Union
|
|
4
7
|
|
|
5
|
-
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from smftools.constants import HMM_DIR, LOAD_DIR, LOGGING_DIR, PREPROCESS_DIR, SPATIAL_DIR
|
|
11
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
6
12
|
|
|
7
13
|
from .helpers import AdataPaths
|
|
8
14
|
|
|
@@ -76,6 +82,62 @@ def delete_tsvs(
|
|
|
76
82
|
logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
|
|
77
83
|
|
|
78
84
|
|
|
85
|
+
def load_adata(config_path: str):
|
|
86
|
+
"""
|
|
87
|
+
CLI-facing wrapper for the load pipeline.
|
|
88
|
+
|
|
89
|
+
- Reads config CSV into ExperimentConfig
|
|
90
|
+
- Computes canonical paths for all downstream AnnData stages
|
|
91
|
+
- Registers those in the summary CSV
|
|
92
|
+
- Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
|
|
93
|
+
- If needed, calls the core pipeline to actually build the raw AnnData
|
|
94
|
+
|
|
95
|
+
Returns
|
|
96
|
+
-------
|
|
97
|
+
adata : anndata.AnnData | None
|
|
98
|
+
Newly created AnnData object, or None if we skipped because a later-stage
|
|
99
|
+
AnnData already exists.
|
|
100
|
+
adata_path : pathlib.Path
|
|
101
|
+
Path to the "current" AnnData that should be used downstream.
|
|
102
|
+
cfg : ExperimentConfig
|
|
103
|
+
Config object for downstream steps.
|
|
104
|
+
"""
|
|
105
|
+
from datetime import datetime
|
|
106
|
+
from importlib import resources
|
|
107
|
+
|
|
108
|
+
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
109
|
+
from .helpers import get_adata_paths, load_experiment_config
|
|
110
|
+
|
|
111
|
+
# -----------------------------
|
|
112
|
+
# 1) Load config into cfg
|
|
113
|
+
# -----------------------------
|
|
114
|
+
cfg = load_experiment_config(config_path)
|
|
115
|
+
|
|
116
|
+
# Ensure base output dir
|
|
117
|
+
output_directory = Path(cfg.output_directory)
|
|
118
|
+
make_dirs([output_directory])
|
|
119
|
+
|
|
120
|
+
# -----------------------------
|
|
121
|
+
# 2) Compute and register paths
|
|
122
|
+
# -----------------------------
|
|
123
|
+
paths = get_adata_paths(cfg)
|
|
124
|
+
|
|
125
|
+
# -----------------------------
|
|
126
|
+
# 3) Stage skipping logic
|
|
127
|
+
# -----------------------------
|
|
128
|
+
if not getattr(cfg, "force_redo_load_adata", False):
|
|
129
|
+
if paths.raw.exists():
|
|
130
|
+
logger.info(
|
|
131
|
+
f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
|
|
132
|
+
)
|
|
133
|
+
return None, paths.raw, cfg
|
|
134
|
+
|
|
135
|
+
# If we get here, we actually want to run the full load pipeline
|
|
136
|
+
adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
|
|
137
|
+
|
|
138
|
+
return adata, adata_path, cfg
|
|
139
|
+
|
|
140
|
+
|
|
79
141
|
def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
80
142
|
"""
|
|
81
143
|
Core load pipeline.
|
|
@@ -105,9 +167,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
105
167
|
cfg : ExperimentConfig
|
|
106
168
|
(Same object, possibly with some fields updated, e.g. fasta path.)
|
|
107
169
|
"""
|
|
108
|
-
from
|
|
109
|
-
|
|
110
|
-
import numpy as np
|
|
170
|
+
from datetime import datetime
|
|
111
171
|
|
|
112
172
|
from ..informatics.bam_functions import (
|
|
113
173
|
align_and_sort_BAM,
|
|
@@ -115,6 +175,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
115
175
|
concatenate_fastqs_to_bam,
|
|
116
176
|
demux_and_index_BAM,
|
|
117
177
|
extract_read_features_from_bam,
|
|
178
|
+
extract_read_tags_from_bam,
|
|
118
179
|
split_and_index_BAM,
|
|
119
180
|
)
|
|
120
181
|
from ..informatics.basecalling import canoncall, modcall
|
|
@@ -125,7 +186,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
125
186
|
get_chromosome_lengths,
|
|
126
187
|
subsample_fasta_from_bed,
|
|
127
188
|
)
|
|
128
|
-
from ..informatics.h5ad_functions import
|
|
189
|
+
from ..informatics.h5ad_functions import (
|
|
190
|
+
add_read_length_and_mapping_qc,
|
|
191
|
+
add_read_tag_annotations,
|
|
192
|
+
add_secondary_supplementary_alignment_flags,
|
|
193
|
+
)
|
|
129
194
|
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
130
195
|
from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
|
|
131
196
|
from ..informatics.pod5_functions import fast5_to_pod5
|
|
@@ -135,8 +200,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
135
200
|
from .helpers import write_gz_h5ad
|
|
136
201
|
|
|
137
202
|
################################### 1) General params and input organization ###################################
|
|
203
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
204
|
+
now = datetime.now()
|
|
205
|
+
time_str = now.strftime("%H%M%S")
|
|
206
|
+
|
|
207
|
+
log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
|
|
208
|
+
|
|
138
209
|
output_directory = Path(cfg.output_directory)
|
|
139
|
-
|
|
210
|
+
load_directory = output_directory / LOAD_DIR
|
|
211
|
+
logging_directory = load_directory / LOGGING_DIR
|
|
212
|
+
|
|
213
|
+
make_dirs([output_directory, load_directory])
|
|
214
|
+
|
|
215
|
+
if cfg.emit_log_file:
|
|
216
|
+
log_file = logging_directory / f"{date_str}_{time_str}_log.log"
|
|
217
|
+
make_dirs([logging_directory])
|
|
218
|
+
else:
|
|
219
|
+
log_file = None
|
|
220
|
+
|
|
221
|
+
setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
|
|
140
222
|
|
|
141
223
|
raw_adata_path = paths.raw
|
|
142
224
|
pp_adata_path = paths.pp
|
|
@@ -150,11 +232,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
150
232
|
|
|
151
233
|
# Direct methylation detection SMF specific parameters
|
|
152
234
|
if cfg.smf_modality == "direct":
|
|
153
|
-
mod_bed_dir =
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
|
|
157
|
-
bam_qc_dir = cfg.output_directory / "bam_qc"
|
|
235
|
+
mod_bed_dir = load_directory / "mod_beds"
|
|
236
|
+
mod_tsv_dir = load_directory / "mod_tsvs"
|
|
237
|
+
bam_qc_dir = load_directory / "bam_qc"
|
|
158
238
|
mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
|
|
159
239
|
|
|
160
240
|
if not check_executable_exists("dorado"):
|
|
@@ -190,7 +270,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
190
270
|
# If the input files are fast5 files, convert the files to a pod5 file before proceeding.
|
|
191
271
|
if cfg.input_type == "fast5":
|
|
192
272
|
# take the input directory of fast5 files and write out a single pod5 file into the output directory.
|
|
193
|
-
output_pod5 =
|
|
273
|
+
output_pod5 = load_directory / "FAST5s_to_POD5.pod5"
|
|
194
274
|
if output_pod5.exists():
|
|
195
275
|
pass
|
|
196
276
|
else:
|
|
@@ -204,7 +284,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
204
284
|
# If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
|
|
205
285
|
elif cfg.input_type == "fastq":
|
|
206
286
|
# Output file for FASTQ concatenation.
|
|
207
|
-
output_bam =
|
|
287
|
+
output_bam = load_directory / "canonical_basecalls.bam"
|
|
208
288
|
if output_bam.exists():
|
|
209
289
|
logger.debug("Output BAM already exists")
|
|
210
290
|
else:
|
|
@@ -219,6 +299,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
219
299
|
rg_sample_field=None,
|
|
220
300
|
progress=False,
|
|
221
301
|
auto_pair=cfg.fastq_auto_pairing,
|
|
302
|
+
samtools_backend=cfg.samtools_backend,
|
|
222
303
|
)
|
|
223
304
|
|
|
224
305
|
logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
|
|
@@ -231,8 +312,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
231
312
|
else:
|
|
232
313
|
pass
|
|
233
314
|
|
|
234
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
235
|
-
|
|
236
315
|
# Determine if the input data needs to be basecalled
|
|
237
316
|
if cfg.input_type == "pod5":
|
|
238
317
|
logger.info(f"Detected pod5 inputs: {cfg.input_files}")
|
|
@@ -249,25 +328,24 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
249
328
|
model_basename = str(model_basename).replace(".", "_")
|
|
250
329
|
if cfg.smf_modality == "direct":
|
|
251
330
|
mod_string = "_".join(cfg.mod_list)
|
|
252
|
-
bam =
|
|
331
|
+
bam = load_directory / f"{model_basename}_{mod_string}_calls"
|
|
253
332
|
else:
|
|
254
|
-
bam =
|
|
333
|
+
bam = load_directory / f"{model_basename}_canonical_basecalls"
|
|
255
334
|
else:
|
|
256
|
-
bam_base = cfg.input_data_path.
|
|
257
|
-
bam = cfg.
|
|
335
|
+
bam_base = cfg.input_data_path.stem
|
|
336
|
+
bam = cfg.input_data_path.parent / bam_base
|
|
258
337
|
|
|
259
338
|
# Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
|
|
260
339
|
unaligned_output = bam.with_suffix(cfg.bam_suffix)
|
|
340
|
+
|
|
261
341
|
aligned_BAM = (
|
|
262
|
-
|
|
342
|
+
load_directory / (bam.stem + "_aligned")
|
|
263
343
|
) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
|
|
344
|
+
|
|
264
345
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
265
346
|
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
266
347
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
267
348
|
|
|
268
|
-
add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
|
|
269
|
-
add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
|
|
270
|
-
add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
|
|
271
349
|
########################################################################################################################
|
|
272
350
|
|
|
273
351
|
################################### 2) FASTA Handling ###################################
|
|
@@ -281,11 +359,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
281
359
|
if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
|
|
282
360
|
fasta_stem = cfg.fasta.stem
|
|
283
361
|
bed_stem = Path(cfg.fasta_regions_of_interest).stem
|
|
284
|
-
output_FASTA =
|
|
362
|
+
output_FASTA = load_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
|
|
285
363
|
|
|
286
364
|
logger.info("Subsampling FASTA records using the provided BED file")
|
|
287
365
|
subsample_fasta_from_bed(
|
|
288
|
-
cfg.fasta, cfg.fasta_regions_of_interest,
|
|
366
|
+
cfg.fasta, cfg.fasta_regions_of_interest, load_directory, output_FASTA
|
|
289
367
|
)
|
|
290
368
|
fasta = output_FASTA
|
|
291
369
|
else:
|
|
@@ -296,7 +374,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
296
374
|
if cfg.smf_modality == "conversion":
|
|
297
375
|
fasta_stem = fasta.stem
|
|
298
376
|
converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
|
|
299
|
-
converted_FASTA =
|
|
377
|
+
converted_FASTA = load_directory / converted_FASTA_basename
|
|
300
378
|
|
|
301
379
|
if "converted.fa" in fasta.name:
|
|
302
380
|
logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
|
|
@@ -308,8 +386,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
308
386
|
generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
|
|
309
387
|
fasta = converted_FASTA
|
|
310
388
|
|
|
311
|
-
add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
|
|
312
|
-
|
|
313
389
|
# Make a FAI and .chrom.names file for the fasta
|
|
314
390
|
get_chromosome_lengths(fasta)
|
|
315
391
|
########################################################################################################################
|
|
@@ -370,13 +446,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
370
446
|
logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
|
|
371
447
|
else:
|
|
372
448
|
logger.info(f"Aligning and sorting reads")
|
|
373
|
-
align_and_sort_BAM(fasta, unaligned_output, cfg)
|
|
449
|
+
align_and_sort_BAM(fasta, unaligned_output, aligned_output, cfg)
|
|
374
450
|
# Deleted the unsorted aligned output
|
|
375
451
|
aligned_output.unlink()
|
|
376
452
|
|
|
377
453
|
if cfg.make_beds:
|
|
378
454
|
# Make beds and provide basic histograms
|
|
379
|
-
bed_dir =
|
|
455
|
+
bed_dir = load_directory / "beds"
|
|
380
456
|
if bed_dir.is_dir():
|
|
381
457
|
logger.debug(
|
|
382
458
|
f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
|
|
@@ -384,7 +460,14 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
384
460
|
else:
|
|
385
461
|
logger.info("Making bed files from the aligned and sorted BAM file")
|
|
386
462
|
aligned_BAM_to_bed(
|
|
387
|
-
aligned_sorted_output,
|
|
463
|
+
aligned_sorted_output,
|
|
464
|
+
load_directory,
|
|
465
|
+
fasta,
|
|
466
|
+
cfg.make_bigwigs,
|
|
467
|
+
cfg.threads,
|
|
468
|
+
samtools_backend=cfg.samtools_backend,
|
|
469
|
+
bedtools_backend=cfg.bedtools_backend,
|
|
470
|
+
bigwig_backend=cfg.bigwig_backend,
|
|
388
471
|
)
|
|
389
472
|
########################################################################################################################
|
|
390
473
|
|
|
@@ -404,13 +487,19 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
404
487
|
else:
|
|
405
488
|
make_dirs([cfg.split_path])
|
|
406
489
|
logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
|
|
407
|
-
all_bam_files = split_and_index_BAM(
|
|
490
|
+
all_bam_files = split_and_index_BAM(
|
|
491
|
+
aligned_sorted_BAM,
|
|
492
|
+
cfg.split_path,
|
|
493
|
+
cfg.bam_suffix,
|
|
494
|
+
samtools_backend=cfg.samtools_backend,
|
|
495
|
+
)
|
|
408
496
|
|
|
409
497
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
410
498
|
bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
|
|
411
499
|
|
|
412
500
|
se_bam_files = bam_files
|
|
413
501
|
bam_dir = cfg.split_path
|
|
502
|
+
double_barcoded_path = None
|
|
414
503
|
|
|
415
504
|
else:
|
|
416
505
|
if single_barcoded_path.is_dir():
|
|
@@ -489,19 +578,34 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
489
578
|
else:
|
|
490
579
|
logger.info("Making BED files from BAM files for each sample")
|
|
491
580
|
for bam in bam_files:
|
|
492
|
-
aligned_BAM_to_bed(
|
|
581
|
+
aligned_BAM_to_bed(
|
|
582
|
+
bam,
|
|
583
|
+
cfg.split_path,
|
|
584
|
+
fasta,
|
|
585
|
+
cfg.make_bigwigs,
|
|
586
|
+
cfg.threads,
|
|
587
|
+
samtools_backend=cfg.samtools_backend,
|
|
588
|
+
bedtools_backend=cfg.bedtools_backend,
|
|
589
|
+
bigwig_backend=cfg.bigwig_backend,
|
|
590
|
+
)
|
|
493
591
|
########################################################################################################################
|
|
494
592
|
|
|
495
593
|
################################### 6) SAMTools based BAM QC ######################################################################
|
|
496
594
|
|
|
497
595
|
# 5) Samtools QC metrics on split BAM files
|
|
498
|
-
bam_qc_dir =
|
|
596
|
+
bam_qc_dir = load_directory / "bam_qc"
|
|
499
597
|
if bam_qc_dir.is_dir():
|
|
500
598
|
logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
|
|
501
599
|
else:
|
|
502
600
|
make_dirs([bam_qc_dir])
|
|
503
601
|
logger.info("Performing BAM QC")
|
|
504
|
-
bam_qc(
|
|
602
|
+
bam_qc(
|
|
603
|
+
bam_files,
|
|
604
|
+
bam_qc_dir,
|
|
605
|
+
cfg.threads,
|
|
606
|
+
modality=cfg.smf_modality,
|
|
607
|
+
samtools_backend=cfg.samtools_backend,
|
|
608
|
+
)
|
|
505
609
|
########################################################################################################################
|
|
506
610
|
|
|
507
611
|
################################### 7) AnnData loading ######################################################################
|
|
@@ -518,7 +622,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
518
622
|
raw_adata, raw_adata_path = converted_BAM_to_adata(
|
|
519
623
|
fasta,
|
|
520
624
|
bam_dir,
|
|
521
|
-
|
|
625
|
+
load_directory,
|
|
522
626
|
cfg.input_already_demuxed,
|
|
523
627
|
cfg.mapping_threshold,
|
|
524
628
|
cfg.experiment_name,
|
|
@@ -529,6 +633,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
529
633
|
deaminase_footprinting,
|
|
530
634
|
delete_intermediates=cfg.delete_intermediate_hdfs,
|
|
531
635
|
double_barcoded_path=double_barcoded_path,
|
|
636
|
+
samtools_backend=cfg.samtools_backend,
|
|
532
637
|
)
|
|
533
638
|
else:
|
|
534
639
|
if mod_bed_dir.is_dir():
|
|
@@ -574,7 +679,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
574
679
|
raw_adata, raw_adata_path = modkit_extract_to_adata(
|
|
575
680
|
fasta,
|
|
576
681
|
bam_dir,
|
|
577
|
-
|
|
682
|
+
load_directory,
|
|
578
683
|
cfg.input_already_demuxed,
|
|
579
684
|
cfg.mapping_threshold,
|
|
580
685
|
cfg.experiment_name,
|
|
@@ -584,6 +689,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
584
689
|
cfg.delete_batch_hdfs,
|
|
585
690
|
cfg.threads,
|
|
586
691
|
double_barcoded_path,
|
|
692
|
+
cfg.samtools_backend,
|
|
587
693
|
)
|
|
588
694
|
if cfg.delete_intermediate_tsvs:
|
|
589
695
|
delete_tsvs(mod_tsv_dir)
|
|
@@ -604,8 +710,28 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
604
710
|
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
605
711
|
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
606
712
|
force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
|
|
713
|
+
samtools_backend=cfg.samtools_backend,
|
|
714
|
+
)
|
|
715
|
+
|
|
716
|
+
logger.info("Adding BAM tags and BAM flags to adata.obs")
|
|
717
|
+
add_read_tag_annotations(
|
|
718
|
+
raw_adata,
|
|
719
|
+
se_bam_files,
|
|
720
|
+
tag_names=getattr(cfg, "bam_tag_names", ["NM", "MD", "MM", "ML"]),
|
|
721
|
+
include_flags=True,
|
|
722
|
+
include_cigar=True,
|
|
723
|
+
extract_read_tags_from_bam_callable=extract_read_tags_from_bam,
|
|
724
|
+
samtools_backend=cfg.samtools_backend,
|
|
607
725
|
)
|
|
608
726
|
|
|
727
|
+
if getattr(cfg, "annotate_secondary_supplementary", False):
|
|
728
|
+
logger.info("Annotating secondary/supplementary alignments from aligned BAM")
|
|
729
|
+
add_secondary_supplementary_alignment_flags(
|
|
730
|
+
raw_adata,
|
|
731
|
+
aligned_sorted_output,
|
|
732
|
+
samtools_backend=cfg.samtools_backend,
|
|
733
|
+
)
|
|
734
|
+
|
|
609
735
|
raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
|
|
610
736
|
########################################################################################################################
|
|
611
737
|
|
|
@@ -618,7 +744,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
618
744
|
raw_adata,
|
|
619
745
|
cfg.input_data_path,
|
|
620
746
|
n_jobs=cfg.threads,
|
|
621
|
-
csv_path=
|
|
747
|
+
csv_path=load_directory / "read_to_pod5_origin_mapping.csv",
|
|
622
748
|
)
|
|
623
749
|
########################################################################################################################
|
|
624
750
|
|
|
@@ -637,12 +763,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
637
763
|
############################################### MultiQC HTML Report ###############################################
|
|
638
764
|
|
|
639
765
|
# multiqc ###
|
|
640
|
-
mqc_dir =
|
|
766
|
+
mqc_dir = load_directory / "multiqc"
|
|
641
767
|
if mqc_dir.is_dir():
|
|
642
|
-
logger.
|
|
768
|
+
logger.info(f"{mqc_dir} already exists, skipping multiqc")
|
|
643
769
|
else:
|
|
644
770
|
logger.info("Running multiqc")
|
|
645
|
-
run_multiqc(
|
|
771
|
+
run_multiqc(bam_qc_dir, mqc_dir)
|
|
646
772
|
########################################################################################################################
|
|
647
773
|
|
|
648
774
|
############################################### delete intermediate BAM files ###############################################
|
|
@@ -665,93 +791,3 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
|
665
791
|
########################################################################################################################
|
|
666
792
|
|
|
667
793
|
return raw_adata, raw_adata_path, cfg
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
def load_adata(config_path: str):
|
|
671
|
-
"""
|
|
672
|
-
CLI-facing wrapper for the load pipeline.
|
|
673
|
-
|
|
674
|
-
- Reads config CSV into ExperimentConfig
|
|
675
|
-
- Computes canonical paths for all downstream AnnData stages
|
|
676
|
-
- Registers those in the summary CSV
|
|
677
|
-
- Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
|
|
678
|
-
- If needed, calls the core pipeline to actually build the raw AnnData
|
|
679
|
-
|
|
680
|
-
Returns
|
|
681
|
-
-------
|
|
682
|
-
adata : anndata.AnnData | None
|
|
683
|
-
Newly created AnnData object, or None if we skipped because a later-stage
|
|
684
|
-
AnnData already exists.
|
|
685
|
-
adata_path : pathlib.Path
|
|
686
|
-
Path to the "current" AnnData that should be used downstream.
|
|
687
|
-
cfg : ExperimentConfig
|
|
688
|
-
Config object for downstream steps.
|
|
689
|
-
"""
|
|
690
|
-
from datetime import datetime
|
|
691
|
-
from importlib import resources
|
|
692
|
-
|
|
693
|
-
from ..config import ExperimentConfig, LoadExperimentConfig
|
|
694
|
-
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
695
|
-
from .helpers import get_adata_paths
|
|
696
|
-
|
|
697
|
-
date_str = datetime.today().strftime("%y%m%d")
|
|
698
|
-
|
|
699
|
-
# -----------------------------
|
|
700
|
-
# 1) Load config into cfg
|
|
701
|
-
# -----------------------------
|
|
702
|
-
loader = LoadExperimentConfig(config_path)
|
|
703
|
-
defaults_dir = resources.files("smftools").joinpath("config")
|
|
704
|
-
cfg, report = ExperimentConfig.from_var_dict(
|
|
705
|
-
loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
|
|
706
|
-
)
|
|
707
|
-
|
|
708
|
-
# Ensure base output dir
|
|
709
|
-
make_dirs([cfg.output_directory])
|
|
710
|
-
|
|
711
|
-
# -----------------------------
|
|
712
|
-
# 2) Compute and register paths
|
|
713
|
-
# -----------------------------
|
|
714
|
-
paths = get_adata_paths(cfg)
|
|
715
|
-
|
|
716
|
-
# experiment-level metadata in summary CSV
|
|
717
|
-
add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
|
|
718
|
-
add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
|
|
719
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
720
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
|
|
721
|
-
|
|
722
|
-
# AnnData stage paths
|
|
723
|
-
add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
|
|
724
|
-
add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
|
|
725
|
-
add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
|
|
726
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
|
|
727
|
-
add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
|
|
728
|
-
|
|
729
|
-
# -----------------------------
|
|
730
|
-
# 3) Stage skipping logic
|
|
731
|
-
# -----------------------------
|
|
732
|
-
if not getattr(cfg, "force_redo_load_adata", False):
|
|
733
|
-
if paths.hmm.exists():
|
|
734
|
-
logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
|
|
735
|
-
return None, paths.hmm, cfg
|
|
736
|
-
if paths.spatial.exists():
|
|
737
|
-
logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
|
|
738
|
-
return None, paths.spatial, cfg
|
|
739
|
-
if paths.pp_dedup.exists():
|
|
740
|
-
logger.debug(
|
|
741
|
-
f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
|
|
742
|
-
f"Skipping smftools load"
|
|
743
|
-
)
|
|
744
|
-
return None, paths.pp_dedup, cfg
|
|
745
|
-
if paths.pp.exists():
|
|
746
|
-
logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
|
|
747
|
-
return None, paths.pp, cfg
|
|
748
|
-
if paths.raw.exists():
|
|
749
|
-
logger.debug(
|
|
750
|
-
f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
|
|
751
|
-
)
|
|
752
|
-
return None, paths.raw, cfg
|
|
753
|
-
|
|
754
|
-
# If we get here, we actually want to run the full load pipeline
|
|
755
|
-
adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
|
|
756
|
-
|
|
757
|
-
return adata, adata_path, cfg
|