smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli/load_adata.py
CHANGED
|
@@ -1,11 +1,19 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Iterable, Union
|
|
4
|
+
|
|
5
|
+
from smftools.logging_utils import get_logger
|
|
6
|
+
|
|
7
|
+
from .helpers import AdataPaths
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
4
11
|
|
|
5
12
|
def check_executable_exists(cmd: str) -> bool:
|
|
6
13
|
"""Return True if a command-line executable is available in PATH."""
|
|
7
14
|
return shutil.which(cmd) is not None
|
|
8
15
|
|
|
16
|
+
|
|
9
17
|
def delete_tsvs(
|
|
10
18
|
tsv_dir: Union[str, Path, Iterable[str], None],
|
|
11
19
|
*,
|
|
@@ -25,158 +33,130 @@ def delete_tsvs(
|
|
|
25
33
|
verbose : bool
|
|
26
34
|
Print progress / warnings.
|
|
27
35
|
"""
|
|
36
|
+
|
|
28
37
|
# Helper: remove a single file path (Path-like or string)
|
|
29
38
|
def _maybe_unlink(p: Path):
|
|
30
39
|
if not p.exists():
|
|
31
40
|
if verbose:
|
|
32
|
-
|
|
41
|
+
logger.info(f"[skip] not found: {p}")
|
|
33
42
|
return
|
|
34
43
|
if not p.is_file():
|
|
35
44
|
if verbose:
|
|
36
|
-
|
|
45
|
+
logger.info(f"[skip] not a file: {p}")
|
|
37
46
|
return
|
|
38
47
|
if dry_run:
|
|
39
|
-
|
|
48
|
+
logger.info(f"[dry-run] would remove file: {p}")
|
|
40
49
|
return
|
|
41
50
|
try:
|
|
42
51
|
p.unlink()
|
|
43
52
|
if verbose:
|
|
44
|
-
|
|
53
|
+
logger.info(f"Removed file: {p}")
|
|
45
54
|
except Exception as e:
|
|
46
|
-
|
|
55
|
+
logger.warning(f"Failed to remove file {p}: {e}")
|
|
47
56
|
|
|
48
57
|
# Remove tsv_dir recursively (if provided)
|
|
49
58
|
if tsv_dir is not None:
|
|
50
59
|
td = Path(tsv_dir)
|
|
51
60
|
if not td.exists():
|
|
52
61
|
if verbose:
|
|
53
|
-
|
|
62
|
+
logger.info(f"[skip] tsv_dir not found: {td}")
|
|
54
63
|
else:
|
|
55
64
|
if not td.is_dir():
|
|
56
65
|
if verbose:
|
|
57
|
-
|
|
66
|
+
logger.info(f"[skip] tsv_dir is not a directory: {td}")
|
|
58
67
|
else:
|
|
59
68
|
if dry_run:
|
|
60
|
-
|
|
69
|
+
logger.info(f"[dry-run] would remove directory tree: {td}")
|
|
61
70
|
else:
|
|
62
71
|
try:
|
|
63
72
|
shutil.rmtree(td)
|
|
64
73
|
if verbose:
|
|
65
|
-
|
|
74
|
+
logger.info(f"Removed directory tree: {td}")
|
|
66
75
|
except Exception as e:
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def load_adata(config_path):
|
|
70
|
-
"""
|
|
71
|
-
High-level function to call for converting raw sequencing data to an adata object.
|
|
72
|
-
Command line accesses this through smftools load <config_path>
|
|
73
|
-
Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
|
|
74
|
-
Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
|
|
75
|
-
Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
|
|
76
|
+
logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
|
|
76
77
|
|
|
77
|
-
Parameters:
|
|
78
|
-
config_path (str): A string representing the file path to the experiment configuration csv file.
|
|
79
78
|
|
|
80
|
-
|
|
81
|
-
adata, adata_path, se_bam_files, cfg
|
|
79
|
+
def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
82
80
|
"""
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
81
|
+
Core load pipeline.
|
|
82
|
+
|
|
83
|
+
Assumes:
|
|
84
|
+
- cfg is a fully initialized ExperimentConfig
|
|
85
|
+
- paths is an AdataPaths object describing canonical h5ad stage paths
|
|
86
|
+
- No stage-skipping or early returns based on existing AnnDatas are done here
|
|
87
|
+
(that happens in the wrapper).
|
|
88
|
+
|
|
89
|
+
Does:
|
|
90
|
+
- handle input format (fast5/pod5/fastq/bam/h5ad)
|
|
91
|
+
- basecalling / alignment / demux / BAM QC
|
|
92
|
+
- optional bed + bigwig generation
|
|
93
|
+
- AnnData construction (conversion or direct modality)
|
|
94
|
+
- basic read-level QC annotations
|
|
95
|
+
- write raw AnnData to paths.raw
|
|
96
|
+
- run MultiQC
|
|
97
|
+
- optional deletion of intermediate BAMs
|
|
98
|
+
|
|
99
|
+
Returns
|
|
100
|
+
-------
|
|
101
|
+
raw_adata : anndata.AnnData
|
|
102
|
+
Newly created raw AnnData object.
|
|
103
|
+
raw_adata_path : Path
|
|
104
|
+
Path where the raw AnnData was written (paths.raw).
|
|
105
|
+
cfg : ExperimentConfig
|
|
106
|
+
(Same object, possibly with some fields updated, e.g. fasta path.)
|
|
107
|
+
"""
|
|
108
|
+
from pathlib import Path
|
|
88
109
|
|
|
89
110
|
import numpy as np
|
|
90
|
-
import pandas as pd
|
|
91
|
-
import anndata as ad
|
|
92
|
-
import scanpy as sc
|
|
93
|
-
|
|
94
|
-
import os
|
|
95
|
-
from importlib import resources
|
|
96
|
-
from pathlib import Path
|
|
97
111
|
|
|
98
|
-
from
|
|
99
|
-
|
|
112
|
+
from ..informatics.bam_functions import (
|
|
113
|
+
align_and_sort_BAM,
|
|
114
|
+
bam_qc,
|
|
115
|
+
concatenate_fastqs_to_bam,
|
|
116
|
+
demux_and_index_BAM,
|
|
117
|
+
extract_read_features_from_bam,
|
|
118
|
+
split_and_index_BAM,
|
|
119
|
+
)
|
|
120
|
+
from ..informatics.basecalling import canoncall, modcall
|
|
121
|
+
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
122
|
+
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
123
|
+
from ..informatics.fasta_functions import (
|
|
124
|
+
generate_converted_FASTA,
|
|
125
|
+
get_chromosome_lengths,
|
|
126
|
+
subsample_fasta_from_bed,
|
|
127
|
+
)
|
|
128
|
+
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
129
|
+
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
130
|
+
from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
|
|
131
|
+
from ..informatics.pod5_functions import fast5_to_pod5
|
|
132
|
+
from ..informatics.run_multiqc import run_multiqc
|
|
133
|
+
from ..metadata import record_smftools_metadata
|
|
134
|
+
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
135
|
+
from .helpers import write_gz_h5ad
|
|
100
136
|
|
|
101
137
|
################################### 1) General params and input organization ###################################
|
|
138
|
+
output_directory = Path(cfg.output_directory)
|
|
139
|
+
make_dirs([output_directory])
|
|
102
140
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
# Make initial output directory
|
|
109
|
-
make_dirs([cfg.output_directory])
|
|
110
|
-
|
|
111
|
-
# Make a csv that contains experiment summary file paths
|
|
112
|
-
add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
|
|
113
|
-
add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
|
|
114
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
115
|
-
add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
|
|
116
|
-
|
|
117
|
-
# Initial h5ad file naming
|
|
118
|
-
h5_dir = cfg.output_directory / 'h5ads'
|
|
119
|
-
raw_adata_path = h5_dir / f'{cfg.experiment_name}.h5ad.gz'
|
|
120
|
-
|
|
121
|
-
# Preprocessed adata path info
|
|
122
|
-
pp_adata_basename = raw_adata_path.name.split(".")[0] + '_preprocessed.h5ad.gz'
|
|
123
|
-
pp_adata_path = raw_adata_path.parent / pp_adata_basename
|
|
124
|
-
|
|
125
|
-
# Preprocessed duplicate removed adata path info
|
|
126
|
-
if cfg.smf_modality == 'direct':
|
|
127
|
-
# For direct SMF, link the duplicate removed version just to the preprocessed version, since there is not a duplicate removal step for direct workflow
|
|
128
|
-
pp_dup_rem_adata_path = pp_adata_path
|
|
129
|
-
else:
|
|
130
|
-
pp_dup_rem_adata_basename = pp_adata_path.name.split(".")[0] + '_duplicates_removed.h5ad.gz'
|
|
131
|
-
pp_dup_rem_adata_path = pp_adata_path.parent / pp_dup_rem_adata_basename
|
|
132
|
-
|
|
133
|
-
# Preprocessed duplicate removed adata with basic analyses appended path info
|
|
134
|
-
spatial_adata_basename = pp_dup_rem_adata_path.name.split(".")[0] + '_spatial.h5ad.gz'
|
|
135
|
-
spatial_adata_path = pp_dup_rem_adata_path.parent / spatial_adata_basename
|
|
136
|
-
|
|
137
|
-
# hmm adata
|
|
138
|
-
hmm_adata_basename = spatial_adata_path.name.split(".")[0] + '_hmm.h5ad.gz'
|
|
139
|
-
hmm_adata_path = spatial_adata_path.parent / hmm_adata_basename
|
|
140
|
-
|
|
141
|
-
add_or_update_column_in_csv(cfg.summary_file, "load_adata", raw_adata_path)
|
|
142
|
-
add_or_update_column_in_csv(cfg.summary_file, "pp_adata", pp_adata_path)
|
|
143
|
-
add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", pp_dup_rem_adata_path)
|
|
144
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
|
|
145
|
-
add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
|
|
146
|
-
|
|
147
|
-
if cfg.force_redo_load_adata:
|
|
148
|
-
pass
|
|
149
|
-
elif hmm_adata_path.exists():
|
|
150
|
-
print(f"HMM AnnData already exists: {hmm_adata_path}\n Skipping smftools load")
|
|
151
|
-
return None, hmm_adata_path, cfg
|
|
152
|
-
elif spatial_adata_path.exists():
|
|
153
|
-
print(f"Spatial AnnData already exists: {spatial_adata_path}\n Skipping smftools load")
|
|
154
|
-
return None, spatial_adata_path, cfg
|
|
155
|
-
elif pp_dup_rem_adata_path.exists():
|
|
156
|
-
print(f"Preprocessed deduplicated AnnData already exists: {pp_dup_rem_adata_path}\n Skipping smftools load")
|
|
157
|
-
return None, pp_dup_rem_adata_path, cfg
|
|
158
|
-
elif pp_adata_path.exists():
|
|
159
|
-
print(f"Preprocessed Anndata already exists: {pp_adata_path}\n Skipping smftools load")
|
|
160
|
-
return None, pp_adata_path, cfg
|
|
161
|
-
elif raw_adata_path.exists():
|
|
162
|
-
print(f"Anndata from smftools load already exists: {raw_adata_path}\n Skipping smftools load")
|
|
163
|
-
return None, raw_adata_path, cfg
|
|
164
|
-
else:
|
|
165
|
-
pass
|
|
141
|
+
raw_adata_path = paths.raw
|
|
142
|
+
pp_adata_path = paths.pp
|
|
143
|
+
pp_dup_rem_adata_path = paths.pp_dedup
|
|
144
|
+
spatial_adata_path = paths.spatial
|
|
145
|
+
hmm_adata_path = paths.hmm
|
|
166
146
|
|
|
167
147
|
# Naming of the demultiplexed output directory
|
|
168
148
|
double_barcoded_path = cfg.split_path / "both_ends_barcoded"
|
|
169
149
|
single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"
|
|
170
150
|
|
|
171
151
|
# Direct methylation detection SMF specific parameters
|
|
172
|
-
if cfg.smf_modality ==
|
|
152
|
+
if cfg.smf_modality == "direct":
|
|
173
153
|
mod_bed_dir = cfg.output_directory / "mod_beds"
|
|
174
154
|
add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
|
|
175
155
|
mod_tsv_dir = cfg.output_directory / "mod_tsvs"
|
|
176
156
|
add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
|
|
177
157
|
bam_qc_dir = cfg.output_directory / "bam_qc"
|
|
178
|
-
|
|
179
|
-
|
|
158
|
+
mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
|
|
159
|
+
|
|
180
160
|
if not check_executable_exists("dorado"):
|
|
181
161
|
raise RuntimeError(
|
|
182
162
|
"Error: 'dorado' is not installed or not in PATH. "
|
|
@@ -188,9 +168,12 @@ def load_adata(config_path):
|
|
|
188
168
|
"Install from https://github.com/nanoporetech/modkit"
|
|
189
169
|
)
|
|
190
170
|
else:
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
171
|
+
mod_bed_dir = None
|
|
172
|
+
mod_tsv_dir = None
|
|
173
|
+
mods = None
|
|
174
|
+
|
|
175
|
+
# demux / aligner executables
|
|
176
|
+
if (not cfg.input_already_demuxed) or cfg.aligner == "dorado":
|
|
194
177
|
if not check_executable_exists("dorado"):
|
|
195
178
|
raise RuntimeError(
|
|
196
179
|
"Error: 'dorado' is not installed or not in PATH. "
|
|
@@ -200,42 +183,45 @@ def load_adata(config_path):
|
|
|
200
183
|
if cfg.aligner == "minimap2":
|
|
201
184
|
if not check_executable_exists("minimap2"):
|
|
202
185
|
raise RuntimeError(
|
|
203
|
-
"Error: 'minimap2' is not installed or not in PATH. "
|
|
204
|
-
"Install minimap2"
|
|
186
|
+
"Error: 'minimap2' is not installed or not in PATH. Install minimap2"
|
|
205
187
|
)
|
|
206
188
|
|
|
207
189
|
# # Detect the input filetypes
|
|
208
190
|
# If the input files are fast5 files, convert the files to a pod5 file before proceeding.
|
|
209
191
|
if cfg.input_type == "fast5":
|
|
210
192
|
# take the input directory of fast5 files and write out a single pod5 file into the output directory.
|
|
211
|
-
output_pod5 = cfg.output_directory /
|
|
193
|
+
output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
|
|
212
194
|
if output_pod5.exists():
|
|
213
195
|
pass
|
|
214
196
|
else:
|
|
215
|
-
|
|
197
|
+
logger.info(
|
|
198
|
+
f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
|
|
199
|
+
)
|
|
216
200
|
fast5_to_pod5(cfg.input_data_path, output_pod5)
|
|
217
201
|
# Reassign cfg.input_data_path to point to the new pod5 file.
|
|
218
202
|
cfg.input_data_path = output_pod5
|
|
219
|
-
cfg.input_type
|
|
203
|
+
cfg.input_type = "pod5"
|
|
220
204
|
# If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
|
|
221
205
|
elif cfg.input_type == "fastq":
|
|
222
206
|
# Output file for FASTQ concatenation.
|
|
223
|
-
output_bam = cfg.output_directory /
|
|
207
|
+
output_bam = cfg.output_directory / "canonical_basecalls.bam"
|
|
224
208
|
if output_bam.exists():
|
|
225
|
-
|
|
209
|
+
logger.debug("Output BAM already exists")
|
|
226
210
|
else:
|
|
211
|
+
logger.info("Concatenating FASTQ files into a single BAM file")
|
|
227
212
|
summary = concatenate_fastqs_to_bam(
|
|
228
213
|
cfg.input_files,
|
|
229
214
|
output_bam,
|
|
230
|
-
barcode_tag=
|
|
231
|
-
gzip_suffixes=(
|
|
215
|
+
barcode_tag="BC",
|
|
216
|
+
gzip_suffixes=(".gz", ".gzip"),
|
|
232
217
|
barcode_map=cfg.fastq_barcode_map,
|
|
233
218
|
add_read_group=True,
|
|
234
219
|
rg_sample_field=None,
|
|
235
220
|
progress=False,
|
|
236
|
-
auto_pair=cfg.fastq_auto_pairing
|
|
237
|
-
|
|
238
|
-
|
|
221
|
+
auto_pair=cfg.fastq_auto_pairing,
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
|
|
239
225
|
|
|
240
226
|
# Set the input data path to the concatenated BAM.
|
|
241
227
|
cfg.input_data_path = output_bam
|
|
@@ -244,24 +230,24 @@ def load_adata(config_path):
|
|
|
244
230
|
pass
|
|
245
231
|
else:
|
|
246
232
|
pass
|
|
247
|
-
|
|
233
|
+
|
|
248
234
|
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
249
235
|
|
|
250
236
|
# Determine if the input data needs to be basecalled
|
|
251
237
|
if cfg.input_type == "pod5":
|
|
252
|
-
|
|
238
|
+
logger.info(f"Detected pod5 inputs: {cfg.input_files}")
|
|
253
239
|
basecall = True
|
|
254
240
|
elif cfg.input_type in ["bam"]:
|
|
255
|
-
|
|
241
|
+
logger.info(f"Detected bam input: {cfg.input_files}")
|
|
256
242
|
basecall = False
|
|
257
243
|
else:
|
|
258
|
-
|
|
244
|
+
logger.info("Error, can not find input bam or pod5")
|
|
259
245
|
|
|
260
246
|
# Generate the base name of the unaligned bam without the .bam suffix
|
|
261
247
|
if basecall:
|
|
262
248
|
model_basename = Path(cfg.model).name
|
|
263
|
-
model_basename = str(model_basename).replace(
|
|
264
|
-
if cfg.smf_modality ==
|
|
249
|
+
model_basename = str(model_basename).replace(".", "_")
|
|
250
|
+
if cfg.smf_modality == "direct":
|
|
265
251
|
mod_string = "_".join(cfg.mod_list)
|
|
266
252
|
bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
|
|
267
253
|
else:
|
|
@@ -272,7 +258,9 @@ def load_adata(config_path):
|
|
|
272
258
|
|
|
273
259
|
# Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
|
|
274
260
|
unaligned_output = bam.with_suffix(cfg.bam_suffix)
|
|
275
|
-
aligned_BAM =
|
|
261
|
+
aligned_BAM = (
|
|
262
|
+
cfg.output_directory / (bam.stem + "_aligned")
|
|
263
|
+
) # writing aligned outputs under cfg.output_directory allows the input bam to live in a separate directory from the aligned output bams
|
|
276
264
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
277
265
|
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
278
266
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
@@ -283,34 +271,40 @@ def load_adata(config_path):
|
|
|
283
271
|
########################################################################################################################
|
|
284
272
|
|
|
285
273
|
################################### 2) FASTA Handling ###################################
|
|
286
|
-
from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
|
|
287
274
|
|
|
288
275
|
try:
|
|
289
276
|
cfg.fasta = Path(cfg.fasta)
|
|
290
|
-
except:
|
|
291
|
-
|
|
277
|
+
except Exception:
|
|
278
|
+
logger.warning("Need to provide an input FASTA path to proceed with smftools load")
|
|
292
279
|
|
|
293
280
|
# If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
|
|
294
|
-
if cfg.fasta_regions_of_interest and
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
output_FASTA =
|
|
298
|
-
|
|
299
|
-
|
|
281
|
+
if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
|
|
282
|
+
fasta_stem = cfg.fasta.stem
|
|
283
|
+
bed_stem = Path(cfg.fasta_regions_of_interest).stem
|
|
284
|
+
output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
|
|
285
|
+
|
|
286
|
+
logger.info("Subsampling FASTA records using the provided BED file")
|
|
287
|
+
subsample_fasta_from_bed(
|
|
288
|
+
cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
|
|
289
|
+
)
|
|
290
|
+
fasta = output_FASTA
|
|
300
291
|
else:
|
|
292
|
+
logger.info("Using the full FASTA file")
|
|
301
293
|
fasta = cfg.fasta
|
|
302
294
|
|
|
303
295
|
# For conversion style SMF, make a converted reference FASTA
|
|
304
|
-
if cfg.smf_modality ==
|
|
305
|
-
|
|
306
|
-
converted_FASTA_basename =
|
|
296
|
+
if cfg.smf_modality == "conversion":
|
|
297
|
+
fasta_stem = fasta.stem
|
|
298
|
+
converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
|
|
307
299
|
converted_FASTA = cfg.output_directory / converted_FASTA_basename
|
|
308
|
-
|
|
309
|
-
|
|
300
|
+
|
|
301
|
+
if "converted.fa" in fasta.name:
|
|
302
|
+
logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
|
|
310
303
|
converted_FASTA = fasta
|
|
311
304
|
elif converted_FASTA.exists():
|
|
312
|
-
|
|
305
|
+
logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
|
|
313
306
|
else:
|
|
307
|
+
logger.info(f"Converting FASTA base sequences")
|
|
314
308
|
generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
|
|
315
309
|
fasta = converted_FASTA
|
|
316
310
|
|
|
@@ -321,121 +315,164 @@ def load_adata(config_path):
|
|
|
321
315
|
########################################################################################################################
|
|
322
316
|
|
|
323
317
|
################################### 3) Basecalling ###################################
|
|
324
|
-
|
|
318
|
+
|
|
325
319
|
# 1) Basecall using dorado
|
|
326
|
-
if basecall and cfg.sequencer ==
|
|
320
|
+
if basecall and cfg.sequencer == "ont":
|
|
327
321
|
try:
|
|
328
322
|
cfg.model_dir = Path(cfg.model_dir)
|
|
329
|
-
except:
|
|
330
|
-
|
|
323
|
+
except Exception:
|
|
324
|
+
logger.warning(
|
|
325
|
+
"Need to provide a valid path to a dorado model directory to use dorado basecalling"
|
|
326
|
+
)
|
|
331
327
|
if aligned_sorted_output.exists():
|
|
332
|
-
|
|
328
|
+
logger.info(
|
|
329
|
+
f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
|
|
330
|
+
)
|
|
333
331
|
elif unaligned_output.exists():
|
|
334
|
-
|
|
335
|
-
elif cfg.smf_modality !=
|
|
336
|
-
|
|
332
|
+
logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
|
|
333
|
+
elif cfg.smf_modality != "direct":
|
|
334
|
+
logger.info("Running canonical basecalling using dorado")
|
|
335
|
+
canoncall(
|
|
336
|
+
str(cfg.model_dir),
|
|
337
|
+
cfg.model,
|
|
338
|
+
str(cfg.input_data_path),
|
|
339
|
+
cfg.barcode_kit,
|
|
340
|
+
str(bam),
|
|
341
|
+
cfg.bam_suffix,
|
|
342
|
+
cfg.barcode_both_ends,
|
|
343
|
+
cfg.trim,
|
|
344
|
+
cfg.device,
|
|
345
|
+
)
|
|
337
346
|
else:
|
|
338
|
-
|
|
347
|
+
logger.info("Running modified basecalling using dorado")
|
|
348
|
+
modcall(
|
|
349
|
+
str(cfg.model_dir),
|
|
350
|
+
cfg.model,
|
|
351
|
+
str(cfg.input_data_path),
|
|
352
|
+
cfg.barcode_kit,
|
|
353
|
+
cfg.mod_list,
|
|
354
|
+
str(bam),
|
|
355
|
+
cfg.bam_suffix,
|
|
356
|
+
cfg.barcode_both_ends,
|
|
357
|
+
cfg.trim,
|
|
358
|
+
cfg.device,
|
|
359
|
+
)
|
|
339
360
|
elif basecall:
|
|
340
|
-
|
|
361
|
+
logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
|
|
341
362
|
else:
|
|
342
363
|
pass
|
|
343
364
|
########################################################################################################################
|
|
344
365
|
|
|
345
366
|
################################### 4) Alignment and sorting #############################################
|
|
346
|
-
|
|
347
|
-
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
367
|
+
|
|
348
368
|
# 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
|
|
349
369
|
if aligned_sorted_output.exists():
|
|
350
|
-
|
|
370
|
+
logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
|
|
351
371
|
else:
|
|
352
|
-
|
|
372
|
+
logger.info(f"Aligning and sorting reads")
|
|
373
|
+
align_and_sort_BAM(fasta, unaligned_output, cfg)
|
|
353
374
|
# Delete the unsorted aligned output
|
|
354
375
|
aligned_output.unlink()
|
|
355
376
|
|
|
356
377
|
if cfg.make_beds:
|
|
357
378
|
# Make beds and provide basic histograms
|
|
358
|
-
bed_dir = cfg.output_directory /
|
|
379
|
+
bed_dir = cfg.output_directory / "beds"
|
|
359
380
|
if bed_dir.is_dir():
|
|
360
|
-
|
|
381
|
+
logger.debug(
|
|
382
|
+
f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
|
|
383
|
+
)
|
|
361
384
|
else:
|
|
362
|
-
|
|
385
|
+
logger.info("Making bed files from the aligned and sorted BAM file")
|
|
386
|
+
aligned_BAM_to_bed(
|
|
387
|
+
aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
|
|
388
|
+
)
|
|
363
389
|
########################################################################################################################
|
|
364
390
|
|
|
365
391
|
################################### 5) Demultiplexing ######################################################################
|
|
366
|
-
|
|
392
|
+
|
|
367
393
|
# 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
|
|
368
394
|
if cfg.input_already_demuxed:
|
|
369
395
|
if cfg.split_path.is_dir():
|
|
370
|
-
|
|
396
|
+
logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
|
|
371
397
|
|
|
372
398
|
all_bam_files = sorted(
|
|
373
|
-
p for p in cfg.split_path.iterdir()
|
|
374
|
-
if p.is_file()
|
|
375
|
-
and p.suffix == cfg.bam_suffix
|
|
399
|
+
p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
|
|
376
400
|
)
|
|
377
401
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
378
402
|
bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
|
|
379
403
|
|
|
380
404
|
else:
|
|
381
405
|
make_dirs([cfg.split_path])
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
406
|
+
logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
|
|
407
|
+
all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
|
|
408
|
+
|
|
386
409
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
387
410
|
bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
|
|
388
411
|
|
|
389
412
|
se_bam_files = bam_files
|
|
390
413
|
bam_dir = cfg.split_path
|
|
391
|
-
|
|
414
|
+
|
|
392
415
|
else:
|
|
393
416
|
if single_barcoded_path.is_dir():
|
|
394
|
-
|
|
417
|
+
logger.debug(
|
|
418
|
+
f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
|
|
419
|
+
)
|
|
395
420
|
|
|
396
421
|
all_se_bam_files = sorted(
|
|
397
|
-
p
|
|
398
|
-
|
|
399
|
-
and p.suffix == cfg.bam_suffix
|
|
400
|
-
)
|
|
422
|
+
p
|
|
423
|
+
for p in single_barcoded_path.iterdir()
|
|
424
|
+
if p.is_file() and p.suffix == cfg.bam_suffix
|
|
425
|
+
)
|
|
401
426
|
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
402
427
|
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
403
428
|
else:
|
|
404
|
-
make_dirs([cfg.split_path, single_barcoded_path])
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
429
|
+
make_dirs([cfg.split_path, single_barcoded_path])
|
|
430
|
+
logger.info(
|
|
431
|
+
"Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
|
|
432
|
+
)
|
|
433
|
+
all_se_bam_files = demux_and_index_BAM(
|
|
434
|
+
aligned_sorted_BAM,
|
|
435
|
+
single_barcoded_path,
|
|
436
|
+
cfg.bam_suffix,
|
|
437
|
+
cfg.barcode_kit,
|
|
438
|
+
False,
|
|
439
|
+
cfg.trim,
|
|
440
|
+
cfg.threads,
|
|
441
|
+
)
|
|
442
|
+
|
|
413
443
|
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
414
444
|
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
415
|
-
|
|
445
|
+
|
|
416
446
|
if double_barcoded_path.is_dir():
|
|
417
|
-
|
|
447
|
+
logger.debug(
|
|
448
|
+
f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
|
|
449
|
+
)
|
|
418
450
|
|
|
419
451
|
all_de_bam_files = sorted(
|
|
420
|
-
p
|
|
421
|
-
|
|
422
|
-
and p.suffix == cfg.bam_suffix
|
|
423
|
-
)
|
|
452
|
+
p
|
|
453
|
+
for p in double_barcoded_path.iterdir()
|
|
454
|
+
if p.is_file() and p.suffix == cfg.bam_suffix
|
|
455
|
+
)
|
|
424
456
|
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
425
457
|
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
426
|
-
else:
|
|
427
|
-
make_dirs([cfg.split_path, double_barcoded_path])
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
458
|
+
else:
|
|
459
|
+
make_dirs([cfg.split_path, double_barcoded_path])
|
|
460
|
+
logger.info(
|
|
461
|
+
"Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
|
|
462
|
+
)
|
|
463
|
+
all_de_bam_files = demux_and_index_BAM(
|
|
464
|
+
aligned_sorted_BAM,
|
|
465
|
+
double_barcoded_path,
|
|
466
|
+
cfg.bam_suffix,
|
|
467
|
+
cfg.barcode_kit,
|
|
468
|
+
True,
|
|
469
|
+
cfg.trim,
|
|
470
|
+
cfg.threads,
|
|
471
|
+
)
|
|
472
|
+
|
|
436
473
|
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
437
474
|
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
438
|
-
|
|
475
|
+
|
|
439
476
|
bam_files = se_bam_files + de_bam_files
|
|
440
477
|
unclassified_bams = unclassified_se_bams + unclassified_de_bams
|
|
441
478
|
bam_dir = single_barcoded_path
|
|
@@ -444,134 +481,277 @@ def load_adata(config_path):
|
|
|
444
481
|
|
|
445
482
|
if cfg.make_beds:
|
|
446
483
|
# Make beds and provide basic histograms
|
|
447
|
-
bed_dir = cfg.split_path /
|
|
484
|
+
bed_dir = cfg.split_path / "beds"
|
|
448
485
|
if bed_dir.is_dir():
|
|
449
|
-
|
|
486
|
+
logger.debug(
|
|
487
|
+
f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
|
|
488
|
+
)
|
|
450
489
|
else:
|
|
490
|
+
logger.info("Making BED files from BAM files for each sample")
|
|
451
491
|
for bam in bam_files:
|
|
452
492
|
aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
|
|
453
493
|
########################################################################################################################
|
|
454
494
|
|
|
455
495
|
################################### 6) SAMTools based BAM QC ######################################################################
|
|
456
|
-
|
|
496
|
+
|
|
457
497
|
# 5) Samtools QC metrics on split BAM files
|
|
458
498
|
bam_qc_dir = cfg.split_path / "bam_qc"
|
|
459
499
|
if bam_qc_dir.is_dir():
|
|
460
|
-
|
|
500
|
+
logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
|
|
461
501
|
else:
|
|
462
502
|
make_dirs([bam_qc_dir])
|
|
503
|
+
logger.info("Performing BAM QC")
|
|
463
504
|
bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
|
|
464
|
-
########################################################################################################################
|
|
505
|
+
########################################################################################################################
|
|
465
506
|
|
|
466
507
|
################################### 7) AnnData loading ######################################################################
|
|
467
|
-
if cfg.smf_modality !=
|
|
508
|
+
if cfg.smf_modality != "direct":
|
|
468
509
|
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
510
|
+
|
|
469
511
|
# 6) Take the converted BAM and load it into an adata object.
|
|
470
|
-
if cfg.smf_modality ==
|
|
512
|
+
if cfg.smf_modality == "deaminase":
|
|
471
513
|
deaminase_footprinting = True
|
|
472
514
|
else:
|
|
473
515
|
deaminase_footprinting = False
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
516
|
+
|
|
517
|
+
logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
|
|
518
|
+
raw_adata, raw_adata_path = converted_BAM_to_adata(
|
|
519
|
+
fasta,
|
|
520
|
+
bam_dir,
|
|
521
|
+
cfg.output_directory,
|
|
522
|
+
cfg.input_already_demuxed,
|
|
523
|
+
cfg.mapping_threshold,
|
|
524
|
+
cfg.experiment_name,
|
|
525
|
+
cfg.conversion_types,
|
|
526
|
+
cfg.bam_suffix,
|
|
527
|
+
cfg.device,
|
|
528
|
+
cfg.threads,
|
|
529
|
+
deaminase_footprinting,
|
|
530
|
+
delete_intermediates=cfg.delete_intermediate_hdfs,
|
|
531
|
+
double_barcoded_path=double_barcoded_path,
|
|
532
|
+
)
|
|
487
533
|
else:
|
|
488
534
|
if mod_bed_dir.is_dir():
|
|
489
|
-
|
|
535
|
+
logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
|
|
490
536
|
else:
|
|
491
|
-
from ..informatics.modkit_functions import
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
537
|
+
from ..informatics.modkit_functions import make_modbed, modQC
|
|
538
|
+
|
|
539
|
+
make_dirs([mod_bed_dir])
|
|
540
|
+
|
|
541
|
+
logger.info("Performing modQC for direct footprinting samples")
|
|
542
|
+
|
|
543
|
+
modQC(aligned_sorted_output, cfg.thresholds) # get QC metrics for mod calls
|
|
544
|
+
|
|
545
|
+
logger.info("Making modified BED files for direct footprinting samples")
|
|
546
|
+
|
|
547
|
+
make_modbed(
|
|
548
|
+
aligned_sorted_output, cfg.thresholds, mod_bed_dir
|
|
549
|
+
) # Generate bed files of position methylation summaries for every sample
|
|
550
|
+
|
|
501
551
|
from ..informatics.modkit_functions import extract_mods
|
|
552
|
+
|
|
502
553
|
make_dirs([mod_tsv_dir])
|
|
503
554
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
555
|
+
logger.info(
|
|
556
|
+
"Extracting single read modification states into TSVs for direct footprinting samples"
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
extract_mods(
|
|
560
|
+
cfg.thresholds,
|
|
561
|
+
mod_tsv_dir,
|
|
562
|
+
bam_dir,
|
|
563
|
+
cfg.bam_suffix,
|
|
564
|
+
skip_unclassified=cfg.skip_unclassified,
|
|
565
|
+
modkit_summary=False,
|
|
566
|
+
threads=cfg.threads,
|
|
567
|
+
) # Extract methylations calls for split BAM files into split TSV files
|
|
568
|
+
|
|
512
569
|
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
570
|
+
|
|
571
|
+
logger.info("Making Anndata for direct modification detection SMF samples")
|
|
572
|
+
|
|
573
|
+
# 6 Load the modification data from TSVs into an adata object
|
|
574
|
+
raw_adata, raw_adata_path = modkit_extract_to_adata(
|
|
575
|
+
fasta,
|
|
576
|
+
bam_dir,
|
|
577
|
+
cfg.output_directory,
|
|
578
|
+
cfg.input_already_demuxed,
|
|
579
|
+
cfg.mapping_threshold,
|
|
580
|
+
cfg.experiment_name,
|
|
581
|
+
mods,
|
|
582
|
+
cfg.batch_size,
|
|
583
|
+
mod_tsv_dir,
|
|
584
|
+
cfg.delete_batch_hdfs,
|
|
585
|
+
cfg.threads,
|
|
586
|
+
double_barcoded_path,
|
|
587
|
+
)
|
|
526
588
|
if cfg.delete_intermediate_tsvs:
|
|
527
589
|
delete_tsvs(mod_tsv_dir)
|
|
528
590
|
|
|
529
|
-
raw_adata.obs[
|
|
530
|
-
raw_adata.obs[
|
|
591
|
+
raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
|
|
592
|
+
raw_adata.obs["Experiment_name_and_barcode"] = (
|
|
593
|
+
raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
|
|
594
|
+
)
|
|
531
595
|
|
|
532
596
|
########################################################################################################################
|
|
533
597
|
|
|
534
598
|
############################################### Add basic read length, read quality, mapping quality stats ###############################################
|
|
535
|
-
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
536
|
-
from ..informatics.bam_functions import extract_read_features_from_bam
|
|
537
|
-
add_read_length_and_mapping_qc(raw_adata, se_bam_files,
|
|
538
|
-
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
539
|
-
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
540
|
-
force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
|
|
541
599
|
|
|
542
|
-
|
|
600
|
+
logger.info("Adding read length, mapping quality, and modification signal to Anndata")
|
|
601
|
+
add_read_length_and_mapping_qc(
|
|
602
|
+
raw_adata,
|
|
603
|
+
se_bam_files,
|
|
604
|
+
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
605
|
+
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
606
|
+
force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
|
|
610
|
+
########################################################################################################################
|
|
611
|
+
|
|
612
|
+
############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
|
|
613
|
+
from ..informatics.h5ad_functions import annotate_pod5_origin
|
|
614
|
+
|
|
615
|
+
if cfg.input_type == "pod5":
|
|
616
|
+
logger.info("Adding the POD5 origin file to each read into Anndata")
|
|
617
|
+
annotate_pod5_origin(
|
|
618
|
+
raw_adata,
|
|
619
|
+
cfg.input_data_path,
|
|
620
|
+
n_jobs=cfg.threads,
|
|
621
|
+
csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
|
|
622
|
+
)
|
|
543
623
|
########################################################################################################################
|
|
544
624
|
|
|
545
625
|
############################################### Save final adata ###############################################
|
|
546
|
-
|
|
547
|
-
|
|
626
|
+
logger.info(f"Saving AnnData to {raw_adata_path}")
|
|
627
|
+
record_smftools_metadata(
|
|
628
|
+
raw_adata,
|
|
629
|
+
step_name="load",
|
|
630
|
+
cfg=cfg,
|
|
631
|
+
config_path=config_path,
|
|
632
|
+
output_path=raw_adata_path,
|
|
633
|
+
)
|
|
634
|
+
write_gz_h5ad(raw_adata, raw_adata_path)
|
|
548
635
|
########################################################################################################################
|
|
549
636
|
|
|
550
637
|
############################################### MultiQC HTML Report ###############################################
|
|
551
|
-
|
|
638
|
+
|
|
552
639
|
# multiqc ###
|
|
553
640
|
mqc_dir = cfg.split_path / "multiqc"
|
|
554
641
|
if mqc_dir.is_dir():
|
|
555
|
-
|
|
642
|
+
logger.debug(f"{mqc_dir} already exists, skipping multiqc")
|
|
556
643
|
else:
|
|
644
|
+
logger.info("Running multiqc")
|
|
557
645
|
run_multiqc(cfg.split_path, mqc_dir)
|
|
558
646
|
########################################################################################################################
|
|
559
647
|
|
|
560
648
|
############################################### delete intermediate BAM files ###############################################
|
|
561
649
|
if cfg.delete_intermediate_bams:
|
|
650
|
+
logger.info("Deleting intermediate BAM files")
|
|
562
651
|
# delete aligned and sorted bam
|
|
563
652
|
aligned_sorted_output.unlink()
|
|
564
|
-
bai = aligned_sorted_output.parent / (aligned_sorted_output.name +
|
|
653
|
+
bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
|
|
565
654
|
bai.unlink()
|
|
566
655
|
# delete the demultiplexed bams. Keep the demultiplexing summary files and directories to facilitate demultiplexing in the future with these files
|
|
567
656
|
for bam in bam_files:
|
|
568
|
-
bai = bam.parent / (bam.name +
|
|
657
|
+
bai = bam.parent / (bam.name + ".bai")
|
|
569
658
|
bam.unlink()
|
|
570
659
|
bai.unlink()
|
|
571
660
|
for bam in unclassified_bams:
|
|
572
|
-
bai = bam.parent / (bam.name +
|
|
661
|
+
bai = bam.parent / (bam.name + ".bai")
|
|
573
662
|
bam.unlink()
|
|
574
|
-
bai.unlink()
|
|
663
|
+
bai.unlink()
|
|
664
|
+
logger.info("Finished deleting intermediate BAM files")
|
|
575
665
|
########################################################################################################################
|
|
576
666
|
|
|
577
|
-
return raw_adata, raw_adata_path, cfg
|
|
667
|
+
return raw_adata, raw_adata_path, cfg
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
def load_adata(config_path: str):
    """
    Entry point used by the CLI to run (or skip) the load stage.

    Steps, in order:

    1. Build an ``ExperimentConfig`` from the variables that
       ``LoadExperimentConfig`` reads out of ``config_path``, using the
       packaged ``smftools/config`` directory as the defaults location.
    2. Create the base output directory.
    3. Resolve the per-stage AnnData paths with ``get_adata_paths`` and record
       them, together with experiment-level metadata, in the summary CSV.
    4. Unless ``cfg.force_redo_load_adata`` is truthy, return early as soon as
       an AnnData file is found on disk, checking the stages in the order
       hmm, spatial, pp_dedup, pp, raw.
    5. Otherwise hand off to ``load_adata_core`` to build the raw AnnData.

    Parameters
    ----------
    config_path : str
        Location of the experiment configuration passed to
        ``LoadExperimentConfig``.

    Returns
    -------
    adata
        Whatever ``load_adata_core`` produced as its first return value
        (documented upstream as an ``anndata.AnnData``), or ``None`` when the
        load was skipped because a stage output already exists.
    adata_path
        Path of the AnnData file downstream steps should use: the existing
        stage file when skipping, otherwise the path reported by
        ``load_adata_core``.
    cfg
        The experiment configuration object for downstream steps.
    """
    from datetime import datetime
    from importlib import resources

    from ..config import ExperimentConfig, LoadExperimentConfig
    from ..readwrite import add_or_update_column_in_csv, make_dirs
    from .helpers import get_adata_paths

    # Two-digit year/month/day stamp handed to the config builder.
    today_stamp = datetime.today().strftime("%y%m%d")

    # --- 1) Turn the user config into an ExperimentConfig -------------------
    config_loader = LoadExperimentConfig(config_path)
    packaged_defaults = resources.files("smftools").joinpath("config")
    cfg, _report = ExperimentConfig.from_var_dict(
        config_loader.var_dict, date_str=today_stamp, defaults_dir=packaged_defaults
    )

    # The base output directory must exist before anything is written to it.
    make_dirs([cfg.output_directory])

    # --- 2) Resolve stage paths and register everything in the summary CSV --
    paths = get_adata_paths(cfg)

    def _record(column, value):
        """Write one column/value pair into the experiment summary CSV."""
        add_or_update_column_in_csv(cfg.summary_file, column, value)

    # Experiment-level metadata.
    _record("experiment_name", cfg.experiment_name)
    _record("config_path", config_path)
    _record("input_data_path", cfg.input_data_path)
    _record("input_files", [cfg.input_files])

    # One summary column per AnnData stage; attributes are looked up lazily so
    # the calls happen in the same order as writing them out one by one.
    stage_columns = (
        ("load_adata", "raw"),
        ("pp_adata", "pp"),
        ("pp_dedup_adata", "pp_dedup"),
        ("spatial_adata", "spatial"),
        ("hmm_adata", "hmm"),
    )
    for column, stage_attr in stage_columns:
        _record(column, getattr(paths, stage_attr))

    # --- 3) Skip the load when a stage output is already on disk -------------
    # Checked from the most downstream stage back to the raw load output.
    skip_candidates = (
        ("hmm", "HMM AnnData"),
        ("spatial", "Spatial AnnData"),
        ("pp_dedup", "Preprocessed deduplicated AnnData"),
        ("pp", "Preprocessed AnnData"),
        ("raw", "Raw AnnData from smftools load"),
    )
    if not getattr(cfg, "force_redo_load_adata", False):
        for stage_attr, label in skip_candidates:
            existing = getattr(paths, stage_attr)
            if existing.exists():
                logger.debug(f"{label} already exists: {existing}\nSkipping smftools load")
                return None, existing, cfg

    # --- 4) Nothing reusable was found (or a redo was forced): run the load --
    built_adata, built_adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)

    return built_adata, built_adata_path, cfg
|