smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli/load_adata.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
import shutil
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Iterable, Union
|
|
4
|
+
|
|
5
|
+
from smftools.logging_utils import get_logger
|
|
4
6
|
|
|
5
7
|
from .helpers import AdataPaths
|
|
6
8
|
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
7
12
|
def check_executable_exists(cmd: str) -> bool:
|
|
8
13
|
"""Return True if a command-line executable is available in PATH."""
|
|
9
14
|
return shutil.which(cmd) is not None
|
|
10
15
|
|
|
16
|
+
|
|
11
17
|
def delete_tsvs(
|
|
12
18
|
tsv_dir: Union[str, Path, Iterable[str], None],
|
|
13
19
|
*,
|
|
@@ -27,48 +33,50 @@ def delete_tsvs(
|
|
|
27
33
|
verbose : bool
|
|
28
34
|
Print progress / warnings.
|
|
29
35
|
"""
|
|
36
|
+
|
|
30
37
|
# Helper: remove a single file path (Path-like or string)
|
|
31
38
|
def _maybe_unlink(p: Path):
|
|
32
39
|
if not p.exists():
|
|
33
40
|
if verbose:
|
|
34
|
-
|
|
41
|
+
logger.info(f"[skip] not found: {p}")
|
|
35
42
|
return
|
|
36
43
|
if not p.is_file():
|
|
37
44
|
if verbose:
|
|
38
|
-
|
|
45
|
+
logger.info(f"[skip] not a file: {p}")
|
|
39
46
|
return
|
|
40
47
|
if dry_run:
|
|
41
|
-
|
|
48
|
+
logger.info(f"[dry-run] would remove file: {p}")
|
|
42
49
|
return
|
|
43
50
|
try:
|
|
44
51
|
p.unlink()
|
|
45
52
|
if verbose:
|
|
46
|
-
|
|
53
|
+
logger.info(f"Removed file: {p}")
|
|
47
54
|
except Exception as e:
|
|
48
|
-
|
|
55
|
+
logger.warning(f"Failed to remove file {p}: {e}")
|
|
49
56
|
|
|
50
57
|
# Remove tmp_dir recursively (if provided)
|
|
51
58
|
if tsv_dir is not None:
|
|
52
59
|
td = Path(tsv_dir)
|
|
53
60
|
if not td.exists():
|
|
54
61
|
if verbose:
|
|
55
|
-
|
|
62
|
+
logger.info(f"[skip] tsv_dir not found: {td}")
|
|
56
63
|
else:
|
|
57
64
|
if not td.is_dir():
|
|
58
65
|
if verbose:
|
|
59
|
-
|
|
66
|
+
logger.info(f"[skip] tsv_dir is not a directory: {td}")
|
|
60
67
|
else:
|
|
61
68
|
if dry_run:
|
|
62
|
-
|
|
69
|
+
logger.info(f"[dry-run] would remove directory tree: {td}")
|
|
63
70
|
else:
|
|
64
71
|
try:
|
|
65
72
|
shutil.rmtree(td)
|
|
66
73
|
if verbose:
|
|
67
|
-
|
|
74
|
+
logger.info(f"Removed directory tree: {td}")
|
|
68
75
|
except Exception as e:
|
|
69
|
-
|
|
76
|
+
logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
|
|
77
|
+
|
|
70
78
|
|
|
71
|
-
def load_adata_core(cfg, paths: AdataPaths):
|
|
79
|
+
def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
|
|
72
80
|
"""
|
|
73
81
|
Core load pipeline.
|
|
74
82
|
|
|
@@ -97,28 +105,34 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
97
105
|
cfg : ExperimentConfig
|
|
98
106
|
(Same object, possibly with some fields updated, e.g. fasta path.)
|
|
99
107
|
"""
|
|
100
|
-
import os
|
|
101
108
|
from pathlib import Path
|
|
102
109
|
|
|
103
110
|
import numpy as np
|
|
104
|
-
import pandas as pd
|
|
105
|
-
import anndata as ad
|
|
106
|
-
import scanpy as sc
|
|
107
111
|
|
|
108
|
-
from .
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
112
|
+
from ..informatics.bam_functions import (
|
|
113
|
+
align_and_sort_BAM,
|
|
114
|
+
bam_qc,
|
|
115
|
+
concatenate_fastqs_to_bam,
|
|
116
|
+
demux_and_index_BAM,
|
|
117
|
+
extract_read_features_from_bam,
|
|
118
|
+
split_and_index_BAM,
|
|
119
|
+
)
|
|
120
|
+
from ..informatics.basecalling import canoncall, modcall
|
|
113
121
|
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
114
|
-
from ..informatics.pod5_functions import fast5_to_pod5
|
|
115
|
-
from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
|
|
116
|
-
from ..informatics.basecalling import modcall, canoncall
|
|
117
|
-
from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
|
|
118
|
-
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
119
122
|
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
123
|
+
from ..informatics.fasta_functions import (
|
|
124
|
+
generate_converted_FASTA,
|
|
125
|
+
get_chromosome_lengths,
|
|
126
|
+
subsample_fasta_from_bed,
|
|
127
|
+
)
|
|
120
128
|
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
129
|
+
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
130
|
+
from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
|
|
131
|
+
from ..informatics.pod5_functions import fast5_to_pod5
|
|
121
132
|
from ..informatics.run_multiqc import run_multiqc
|
|
133
|
+
from ..metadata import record_smftools_metadata
|
|
134
|
+
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
135
|
+
from .helpers import write_gz_h5ad
|
|
122
136
|
|
|
123
137
|
################################### 1) General params and input organization ###################################
|
|
124
138
|
output_directory = Path(cfg.output_directory)
|
|
@@ -169,19 +183,20 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
169
183
|
if cfg.aligner == "minimap2":
|
|
170
184
|
if not check_executable_exists("minimap2"):
|
|
171
185
|
raise RuntimeError(
|
|
172
|
-
"Error: 'minimap2' is not installed or not in PATH. "
|
|
173
|
-
"Install minimap2"
|
|
186
|
+
"Error: 'minimap2' is not installed or not in PATH. Install minimap2"
|
|
174
187
|
)
|
|
175
188
|
|
|
176
189
|
# # Detect the input filetypes
|
|
177
190
|
# If the input files are fast5 files, convert the files to a pod5 file before proceeding.
|
|
178
191
|
if cfg.input_type == "fast5":
|
|
179
192
|
# take the input directory of fast5 files and write out a single pod5 file into the output directory.
|
|
180
|
-
output_pod5 = cfg.output_directory /
|
|
193
|
+
output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
|
|
181
194
|
if output_pod5.exists():
|
|
182
195
|
pass
|
|
183
196
|
else:
|
|
184
|
-
|
|
197
|
+
logger.info(
|
|
198
|
+
f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
|
|
199
|
+
)
|
|
185
200
|
fast5_to_pod5(cfg.input_data_path, output_pod5)
|
|
186
201
|
# Reassign the pod5_dir variable to point to the new pod5 file.
|
|
187
202
|
cfg.input_data_path = output_pod5
|
|
@@ -189,22 +204,24 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
189
204
|
# If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
|
|
190
205
|
elif cfg.input_type == "fastq":
|
|
191
206
|
# Output file for FASTQ concatenation.
|
|
192
|
-
output_bam = cfg.output_directory /
|
|
207
|
+
output_bam = cfg.output_directory / "canonical_basecalls.bam"
|
|
193
208
|
if output_bam.exists():
|
|
194
|
-
|
|
209
|
+
logger.debug("Output BAM already exists")
|
|
195
210
|
else:
|
|
211
|
+
logger.info("Concatenating FASTQ files into a single BAM file")
|
|
196
212
|
summary = concatenate_fastqs_to_bam(
|
|
197
213
|
cfg.input_files,
|
|
198
214
|
output_bam,
|
|
199
|
-
barcode_tag=
|
|
200
|
-
gzip_suffixes=(
|
|
215
|
+
barcode_tag="BC",
|
|
216
|
+
gzip_suffixes=(".gz", ".gzip"),
|
|
201
217
|
barcode_map=cfg.fastq_barcode_map,
|
|
202
218
|
add_read_group=True,
|
|
203
219
|
rg_sample_field=None,
|
|
204
220
|
progress=False,
|
|
205
|
-
auto_pair=cfg.fastq_auto_pairing
|
|
206
|
-
|
|
207
|
-
|
|
221
|
+
auto_pair=cfg.fastq_auto_pairing,
|
|
222
|
+
)
|
|
223
|
+
|
|
224
|
+
logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
|
|
208
225
|
|
|
209
226
|
# Set the input data path to the concatenated BAM.
|
|
210
227
|
cfg.input_data_path = output_bam
|
|
@@ -213,24 +230,24 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
213
230
|
pass
|
|
214
231
|
else:
|
|
215
232
|
pass
|
|
216
|
-
|
|
233
|
+
|
|
217
234
|
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
218
235
|
|
|
219
236
|
# Determine if the input data needs to be basecalled
|
|
220
237
|
if cfg.input_type == "pod5":
|
|
221
|
-
|
|
238
|
+
logger.info(f"Detected pod5 inputs: {cfg.input_files}")
|
|
222
239
|
basecall = True
|
|
223
240
|
elif cfg.input_type in ["bam"]:
|
|
224
|
-
|
|
241
|
+
logger.info(f"Detected bam input: {cfg.input_files}")
|
|
225
242
|
basecall = False
|
|
226
243
|
else:
|
|
227
|
-
|
|
244
|
+
logger.info("Error, can not find input bam or pod5")
|
|
228
245
|
|
|
229
246
|
# Generate the base name of the unaligned bam without the .bam suffix
|
|
230
247
|
if basecall:
|
|
231
248
|
model_basename = Path(cfg.model).name
|
|
232
|
-
model_basename = str(model_basename).replace(
|
|
233
|
-
if cfg.smf_modality ==
|
|
249
|
+
model_basename = str(model_basename).replace(".", "_")
|
|
250
|
+
if cfg.smf_modality == "direct":
|
|
234
251
|
mod_string = "_".join(cfg.mod_list)
|
|
235
252
|
bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
|
|
236
253
|
else:
|
|
@@ -241,7 +258,9 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
241
258
|
|
|
242
259
|
# Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
|
|
243
260
|
unaligned_output = bam.with_suffix(cfg.bam_suffix)
|
|
244
|
-
aligned_BAM =
|
|
261
|
+
aligned_BAM = (
|
|
262
|
+
cfg.output_directory / (bam.stem + "_aligned")
|
|
263
|
+
) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
|
|
245
264
|
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
246
265
|
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
247
266
|
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
@@ -252,34 +271,40 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
252
271
|
########################################################################################################################
|
|
253
272
|
|
|
254
273
|
################################### 2) FASTA Handling ###################################
|
|
255
|
-
from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
|
|
256
274
|
|
|
257
275
|
try:
|
|
258
276
|
cfg.fasta = Path(cfg.fasta)
|
|
259
|
-
except:
|
|
260
|
-
|
|
277
|
+
except Exception:
|
|
278
|
+
logger.warning("Need to provide an input FASTA path to proceed with smftools load")
|
|
261
279
|
|
|
262
280
|
# If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
|
|
263
|
-
if cfg.fasta_regions_of_interest and
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
output_FASTA =
|
|
267
|
-
|
|
268
|
-
|
|
281
|
+
if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
|
|
282
|
+
fasta_stem = cfg.fasta.stem
|
|
283
|
+
bed_stem = Path(cfg.fasta_regions_of_interest).stem
|
|
284
|
+
output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
|
|
285
|
+
|
|
286
|
+
logger.info("Subsampling FASTA records using the provided BED file")
|
|
287
|
+
subsample_fasta_from_bed(
|
|
288
|
+
cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
|
|
289
|
+
)
|
|
290
|
+
fasta = output_FASTA
|
|
269
291
|
else:
|
|
292
|
+
logger.info("Using the full FASTA file")
|
|
270
293
|
fasta = cfg.fasta
|
|
271
294
|
|
|
272
295
|
# For conversion style SMF, make a converted reference FASTA
|
|
273
|
-
if cfg.smf_modality ==
|
|
274
|
-
|
|
275
|
-
converted_FASTA_basename =
|
|
296
|
+
if cfg.smf_modality == "conversion":
|
|
297
|
+
fasta_stem = fasta.stem
|
|
298
|
+
converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
|
|
276
299
|
converted_FASTA = cfg.output_directory / converted_FASTA_basename
|
|
277
|
-
|
|
278
|
-
|
|
300
|
+
|
|
301
|
+
if "converted.fa" in fasta.name:
|
|
302
|
+
logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
|
|
279
303
|
converted_FASTA = fasta
|
|
280
304
|
elif converted_FASTA.exists():
|
|
281
|
-
|
|
305
|
+
logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
|
|
282
306
|
else:
|
|
307
|
+
logger.info(f"Converting FASTA base sequences")
|
|
283
308
|
generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
|
|
284
309
|
fasta = converted_FASTA
|
|
285
310
|
|
|
@@ -290,121 +315,164 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
290
315
|
########################################################################################################################
|
|
291
316
|
|
|
292
317
|
################################### 3) Basecalling ###################################
|
|
293
|
-
|
|
318
|
+
|
|
294
319
|
# 1) Basecall using dorado
|
|
295
|
-
if basecall and cfg.sequencer ==
|
|
320
|
+
if basecall and cfg.sequencer == "ont":
|
|
296
321
|
try:
|
|
297
322
|
cfg.model_dir = Path(cfg.model_dir)
|
|
298
|
-
except:
|
|
299
|
-
|
|
323
|
+
except Exception:
|
|
324
|
+
logger.warning(
|
|
325
|
+
"Need to provide a valid path to a dorado model directory to use dorado basecalling"
|
|
326
|
+
)
|
|
300
327
|
if aligned_sorted_output.exists():
|
|
301
|
-
|
|
328
|
+
logger.info(
|
|
329
|
+
f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
|
|
330
|
+
)
|
|
302
331
|
elif unaligned_output.exists():
|
|
303
|
-
|
|
304
|
-
elif cfg.smf_modality !=
|
|
305
|
-
|
|
332
|
+
logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
|
|
333
|
+
elif cfg.smf_modality != "direct":
|
|
334
|
+
logger.info("Running canonical basecalling using dorado")
|
|
335
|
+
canoncall(
|
|
336
|
+
str(cfg.model_dir),
|
|
337
|
+
cfg.model,
|
|
338
|
+
str(cfg.input_data_path),
|
|
339
|
+
cfg.barcode_kit,
|
|
340
|
+
str(bam),
|
|
341
|
+
cfg.bam_suffix,
|
|
342
|
+
cfg.barcode_both_ends,
|
|
343
|
+
cfg.trim,
|
|
344
|
+
cfg.device,
|
|
345
|
+
)
|
|
306
346
|
else:
|
|
307
|
-
|
|
347
|
+
logger.info("Running modified basecalling using dorado")
|
|
348
|
+
modcall(
|
|
349
|
+
str(cfg.model_dir),
|
|
350
|
+
cfg.model,
|
|
351
|
+
str(cfg.input_data_path),
|
|
352
|
+
cfg.barcode_kit,
|
|
353
|
+
cfg.mod_list,
|
|
354
|
+
str(bam),
|
|
355
|
+
cfg.bam_suffix,
|
|
356
|
+
cfg.barcode_both_ends,
|
|
357
|
+
cfg.trim,
|
|
358
|
+
cfg.device,
|
|
359
|
+
)
|
|
308
360
|
elif basecall:
|
|
309
|
-
|
|
361
|
+
logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
|
|
310
362
|
else:
|
|
311
363
|
pass
|
|
312
364
|
########################################################################################################################
|
|
313
365
|
|
|
314
366
|
################################### 4) Alignment and sorting #############################################
|
|
315
|
-
|
|
316
|
-
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
367
|
+
|
|
317
368
|
# 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
|
|
318
369
|
if aligned_sorted_output.exists():
|
|
319
|
-
|
|
370
|
+
logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
|
|
320
371
|
else:
|
|
372
|
+
logger.info(f"Aligning and sorting reads")
|
|
321
373
|
align_and_sort_BAM(fasta, unaligned_output, cfg)
|
|
322
374
|
# Deleted the unsorted aligned output
|
|
323
375
|
aligned_output.unlink()
|
|
324
376
|
|
|
325
377
|
if cfg.make_beds:
|
|
326
378
|
# Make beds and provide basic histograms
|
|
327
|
-
bed_dir = cfg.output_directory /
|
|
379
|
+
bed_dir = cfg.output_directory / "beds"
|
|
328
380
|
if bed_dir.is_dir():
|
|
329
|
-
|
|
381
|
+
logger.debug(
|
|
382
|
+
f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
|
|
383
|
+
)
|
|
330
384
|
else:
|
|
331
|
-
|
|
385
|
+
logger.info("Making bed files from the aligned and sorted BAM file")
|
|
386
|
+
aligned_BAM_to_bed(
|
|
387
|
+
aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
|
|
388
|
+
)
|
|
332
389
|
########################################################################################################################
|
|
333
390
|
|
|
334
391
|
################################### 5) Demultiplexing ######################################################################
|
|
335
|
-
|
|
392
|
+
|
|
336
393
|
# 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
|
|
337
394
|
if cfg.input_already_demuxed:
|
|
338
395
|
if cfg.split_path.is_dir():
|
|
339
|
-
|
|
396
|
+
logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
|
|
340
397
|
|
|
341
398
|
all_bam_files = sorted(
|
|
342
|
-
p for p in cfg.split_path.iterdir()
|
|
343
|
-
if p.is_file()
|
|
344
|
-
and p.suffix == cfg.bam_suffix
|
|
399
|
+
p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
|
|
345
400
|
)
|
|
346
401
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
347
402
|
bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
|
|
348
403
|
|
|
349
404
|
else:
|
|
350
405
|
make_dirs([cfg.split_path])
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
406
|
+
logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
|
|
407
|
+
all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
|
|
408
|
+
|
|
355
409
|
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
356
410
|
bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
|
|
357
411
|
|
|
358
412
|
se_bam_files = bam_files
|
|
359
413
|
bam_dir = cfg.split_path
|
|
360
|
-
|
|
414
|
+
|
|
361
415
|
else:
|
|
362
416
|
if single_barcoded_path.is_dir():
|
|
363
|
-
|
|
417
|
+
logger.debug(
|
|
418
|
+
f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
|
|
419
|
+
)
|
|
364
420
|
|
|
365
421
|
all_se_bam_files = sorted(
|
|
366
|
-
p
|
|
367
|
-
|
|
368
|
-
and p.suffix == cfg.bam_suffix
|
|
369
|
-
)
|
|
422
|
+
p
|
|
423
|
+
for p in single_barcoded_path.iterdir()
|
|
424
|
+
if p.is_file() and p.suffix == cfg.bam_suffix
|
|
425
|
+
)
|
|
370
426
|
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
371
427
|
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
372
428
|
else:
|
|
373
|
-
make_dirs([cfg.split_path, single_barcoded_path])
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
429
|
+
make_dirs([cfg.split_path, single_barcoded_path])
|
|
430
|
+
logger.info(
|
|
431
|
+
"Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
|
|
432
|
+
)
|
|
433
|
+
all_se_bam_files = demux_and_index_BAM(
|
|
434
|
+
aligned_sorted_BAM,
|
|
435
|
+
single_barcoded_path,
|
|
436
|
+
cfg.bam_suffix,
|
|
437
|
+
cfg.barcode_kit,
|
|
438
|
+
False,
|
|
439
|
+
cfg.trim,
|
|
440
|
+
cfg.threads,
|
|
441
|
+
)
|
|
442
|
+
|
|
382
443
|
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
383
444
|
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
384
|
-
|
|
445
|
+
|
|
385
446
|
if double_barcoded_path.is_dir():
|
|
386
|
-
|
|
447
|
+
logger.debug(
|
|
448
|
+
f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
|
|
449
|
+
)
|
|
387
450
|
|
|
388
451
|
all_de_bam_files = sorted(
|
|
389
|
-
p
|
|
390
|
-
|
|
391
|
-
and p.suffix == cfg.bam_suffix
|
|
392
|
-
)
|
|
452
|
+
p
|
|
453
|
+
for p in double_barcoded_path.iterdir()
|
|
454
|
+
if p.is_file() and p.suffix == cfg.bam_suffix
|
|
455
|
+
)
|
|
393
456
|
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
394
457
|
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
395
|
-
else:
|
|
396
|
-
make_dirs([cfg.split_path, double_barcoded_path])
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
458
|
+
else:
|
|
459
|
+
make_dirs([cfg.split_path, double_barcoded_path])
|
|
460
|
+
logger.info(
|
|
461
|
+
"Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
|
|
462
|
+
)
|
|
463
|
+
all_de_bam_files = demux_and_index_BAM(
|
|
464
|
+
aligned_sorted_BAM,
|
|
465
|
+
double_barcoded_path,
|
|
466
|
+
cfg.bam_suffix,
|
|
467
|
+
cfg.barcode_kit,
|
|
468
|
+
True,
|
|
469
|
+
cfg.trim,
|
|
470
|
+
cfg.threads,
|
|
471
|
+
)
|
|
472
|
+
|
|
405
473
|
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
406
474
|
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
407
|
-
|
|
475
|
+
|
|
408
476
|
bam_files = se_bam_files + de_bam_files
|
|
409
477
|
unclassified_bams = unclassified_se_bams + unclassified_de_bams
|
|
410
478
|
bam_dir = single_barcoded_path
|
|
@@ -413,138 +481,192 @@ def load_adata_core(cfg, paths: AdataPaths):
|
|
|
413
481
|
|
|
414
482
|
if cfg.make_beds:
|
|
415
483
|
# Make beds and provide basic histograms
|
|
416
|
-
bed_dir = cfg.split_path /
|
|
484
|
+
bed_dir = cfg.split_path / "beds"
|
|
417
485
|
if bed_dir.is_dir():
|
|
418
|
-
|
|
486
|
+
logger.debug(
|
|
487
|
+
f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
|
|
488
|
+
)
|
|
419
489
|
else:
|
|
490
|
+
logger.info("Making BED files from BAM files for each sample")
|
|
420
491
|
for bam in bam_files:
|
|
421
492
|
aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
|
|
422
493
|
########################################################################################################################
|
|
423
494
|
|
|
424
495
|
################################### 6) SAMTools based BAM QC ######################################################################
|
|
425
|
-
|
|
496
|
+
|
|
426
497
|
# 5) Samtools QC metrics on split BAM files
|
|
427
498
|
bam_qc_dir = cfg.split_path / "bam_qc"
|
|
428
499
|
if bam_qc_dir.is_dir():
|
|
429
|
-
|
|
500
|
+
logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
|
|
430
501
|
else:
|
|
431
502
|
make_dirs([bam_qc_dir])
|
|
503
|
+
logger.info("Performing BAM QC")
|
|
432
504
|
bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
|
|
433
|
-
########################################################################################################################
|
|
505
|
+
########################################################################################################################
|
|
434
506
|
|
|
435
507
|
################################### 7) AnnData loading ######################################################################
|
|
436
|
-
if cfg.smf_modality !=
|
|
508
|
+
if cfg.smf_modality != "direct":
|
|
437
509
|
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
510
|
+
|
|
438
511
|
# 6) Take the converted BAM and load it into an adata object.
|
|
439
|
-
if cfg.smf_modality ==
|
|
512
|
+
if cfg.smf_modality == "deaminase":
|
|
440
513
|
deaminase_footprinting = True
|
|
441
514
|
else:
|
|
442
515
|
deaminase_footprinting = False
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
516
|
+
|
|
517
|
+
logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
|
|
518
|
+
raw_adata, raw_adata_path = converted_BAM_to_adata(
|
|
519
|
+
fasta,
|
|
520
|
+
bam_dir,
|
|
521
|
+
cfg.output_directory,
|
|
522
|
+
cfg.input_already_demuxed,
|
|
523
|
+
cfg.mapping_threshold,
|
|
524
|
+
cfg.experiment_name,
|
|
525
|
+
cfg.conversion_types,
|
|
526
|
+
cfg.bam_suffix,
|
|
527
|
+
cfg.device,
|
|
528
|
+
cfg.threads,
|
|
529
|
+
deaminase_footprinting,
|
|
530
|
+
delete_intermediates=cfg.delete_intermediate_hdfs,
|
|
531
|
+
double_barcoded_path=double_barcoded_path,
|
|
532
|
+
)
|
|
456
533
|
else:
|
|
457
534
|
if mod_bed_dir.is_dir():
|
|
458
|
-
|
|
535
|
+
logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
|
|
459
536
|
else:
|
|
460
|
-
from ..informatics.modkit_functions import
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
537
|
+
from ..informatics.modkit_functions import make_modbed, modQC
|
|
538
|
+
|
|
539
|
+
make_dirs([mod_bed_dir])
|
|
540
|
+
|
|
541
|
+
logger.info("Performing modQC for direct footprinting samples")
|
|
542
|
+
|
|
543
|
+
modQC(aligned_sorted_output, cfg.thresholds) # get QC metrics for mod calls
|
|
544
|
+
|
|
545
|
+
logger.info("Making modified BED files for direct footprinting samples")
|
|
546
|
+
|
|
547
|
+
make_modbed(
|
|
548
|
+
aligned_sorted_output, cfg.thresholds, mod_bed_dir
|
|
549
|
+
) # Generate bed files of position methylation summaries for every sample
|
|
550
|
+
|
|
470
551
|
from ..informatics.modkit_functions import extract_mods
|
|
552
|
+
|
|
471
553
|
make_dirs([mod_tsv_dir])
|
|
472
554
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
555
|
+
logger.info(
|
|
556
|
+
"Extracting single read modification states into TSVs for direct footprinting samples"
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
extract_mods(
|
|
560
|
+
cfg.thresholds,
|
|
561
|
+
mod_tsv_dir,
|
|
562
|
+
bam_dir,
|
|
563
|
+
cfg.bam_suffix,
|
|
564
|
+
skip_unclassified=cfg.skip_unclassified,
|
|
565
|
+
modkit_summary=False,
|
|
566
|
+
threads=cfg.threads,
|
|
567
|
+
) # Extract methylations calls for split BAM files into split TSV files
|
|
568
|
+
|
|
481
569
|
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
570
|
+
|
|
571
|
+
logger.info("Making Anndata for direct modification detection SMF samples")
|
|
572
|
+
|
|
573
|
+
# 6 Load the modification data from TSVs into an adata object
|
|
574
|
+
raw_adata, raw_adata_path = modkit_extract_to_adata(
|
|
575
|
+
fasta,
|
|
576
|
+
bam_dir,
|
|
577
|
+
cfg.output_directory,
|
|
578
|
+
cfg.input_already_demuxed,
|
|
579
|
+
cfg.mapping_threshold,
|
|
580
|
+
cfg.experiment_name,
|
|
581
|
+
mods,
|
|
582
|
+
cfg.batch_size,
|
|
583
|
+
mod_tsv_dir,
|
|
584
|
+
cfg.delete_batch_hdfs,
|
|
585
|
+
cfg.threads,
|
|
586
|
+
double_barcoded_path,
|
|
587
|
+
)
|
|
495
588
|
if cfg.delete_intermediate_tsvs:
|
|
496
589
|
delete_tsvs(mod_tsv_dir)
|
|
497
590
|
|
|
498
|
-
raw_adata.obs[
|
|
499
|
-
raw_adata.obs[
|
|
591
|
+
raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
|
|
592
|
+
raw_adata.obs["Experiment_name_and_barcode"] = (
|
|
593
|
+
raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
|
|
594
|
+
)
|
|
500
595
|
|
|
501
596
|
########################################################################################################################
|
|
502
597
|
|
|
503
598
|
############################################### Add basic read length, read quality, mapping quality stats ###############################################
|
|
504
|
-
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
505
|
-
from ..informatics.bam_functions import extract_read_features_from_bam
|
|
506
|
-
add_read_length_and_mapping_qc(raw_adata, se_bam_files,
|
|
507
|
-
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
508
|
-
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
509
|
-
force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
|
|
510
599
|
|
|
511
|
-
|
|
600
|
+
logger.info("Adding read length, mapping quality, and modification signal to Anndata")
|
|
601
|
+
add_read_length_and_mapping_qc(
|
|
602
|
+
raw_adata,
|
|
603
|
+
se_bam_files,
|
|
604
|
+
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
605
|
+
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
606
|
+
force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
|
|
607
|
+
)
|
|
608
|
+
|
|
609
|
+
raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
|
|
610
|
+
########################################################################################################################
|
|
611
|
+
|
|
612
|
+
############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
|
|
613
|
+
from ..informatics.h5ad_functions import annotate_pod5_origin
|
|
614
|
+
|
|
615
|
+
if cfg.input_type == "pod5":
|
|
616
|
+
logger.info("Adding the POD5 origin file to each read into Anndata")
|
|
617
|
+
annotate_pod5_origin(
|
|
618
|
+
raw_adata,
|
|
619
|
+
cfg.input_data_path,
|
|
620
|
+
n_jobs=cfg.threads,
|
|
621
|
+
csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
|
|
622
|
+
)
|
|
512
623
|
########################################################################################################################
|
|
513
624
|
|
|
514
625
|
############################################### Save final adata ###############################################
|
|
515
|
-
|
|
626
|
+
logger.info(f"Saving AnnData to {raw_adata_path}")
|
|
627
|
+
record_smftools_metadata(
|
|
628
|
+
raw_adata,
|
|
629
|
+
step_name="load",
|
|
630
|
+
cfg=cfg,
|
|
631
|
+
config_path=config_path,
|
|
632
|
+
output_path=raw_adata_path,
|
|
633
|
+
)
|
|
516
634
|
write_gz_h5ad(raw_adata, raw_adata_path)
|
|
517
635
|
########################################################################################################################
|
|
518
636
|
|
|
519
637
|
############################################### MultiQC HTML Report ###############################################
|
|
520
|
-
|
|
638
|
+
|
|
521
639
|
# multiqc ###
|
|
522
640
|
mqc_dir = cfg.split_path / "multiqc"
|
|
523
641
|
if mqc_dir.is_dir():
|
|
524
|
-
|
|
642
|
+
logger.debug(f"{mqc_dir} already exists, skipping multiqc")
|
|
525
643
|
else:
|
|
644
|
+
logger.info("Running multiqc")
|
|
526
645
|
run_multiqc(cfg.split_path, mqc_dir)
|
|
527
646
|
########################################################################################################################
|
|
528
647
|
|
|
529
648
|
############################################### delete intermediate BAM files ###############################################
|
|
530
649
|
if cfg.delete_intermediate_bams:
|
|
650
|
+
logger.info("Deleting intermediate BAM files")
|
|
531
651
|
# delete aligned and sorted bam
|
|
532
652
|
aligned_sorted_output.unlink()
|
|
533
|
-
bai = aligned_sorted_output.parent / (aligned_sorted_output.name +
|
|
653
|
+
bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
|
|
534
654
|
bai.unlink()
|
|
535
655
|
# delete the demultiplexed bams. Keep the demultiplexing summary files and directories to faciliate demultiplexing in the future with these files
|
|
536
656
|
for bam in bam_files:
|
|
537
|
-
bai = bam.parent / (bam.name +
|
|
657
|
+
bai = bam.parent / (bam.name + ".bai")
|
|
538
658
|
bam.unlink()
|
|
539
659
|
bai.unlink()
|
|
540
660
|
for bam in unclassified_bams:
|
|
541
|
-
bai = bam.parent / (bam.name +
|
|
661
|
+
bai = bam.parent / (bam.name + ".bai")
|
|
542
662
|
bam.unlink()
|
|
543
|
-
bai.unlink()
|
|
663
|
+
bai.unlink()
|
|
664
|
+
logger.info("Finished deleting intermediate BAM files")
|
|
544
665
|
########################################################################################################################
|
|
545
666
|
|
|
546
667
|
return raw_adata, raw_adata_path, cfg
|
|
547
668
|
|
|
669
|
+
|
|
548
670
|
def load_adata(config_path: str):
|
|
549
671
|
"""
|
|
550
672
|
CLI-facing wrapper for the load pipeline.
|
|
@@ -565,15 +687,11 @@ def load_adata(config_path: str):
|
|
|
565
687
|
cfg : ExperimentConfig
|
|
566
688
|
Config object for downstream steps.
|
|
567
689
|
"""
|
|
568
|
-
from importlib import resources
|
|
569
690
|
from datetime import datetime
|
|
570
|
-
from
|
|
571
|
-
|
|
572
|
-
import pandas as pd # used for summary file reading downstream if needed
|
|
573
|
-
|
|
574
|
-
from ..readwrite import make_dirs, add_or_update_column_in_csv
|
|
575
|
-
from ..config import LoadExperimentConfig, ExperimentConfig
|
|
691
|
+
from importlib import resources
|
|
576
692
|
|
|
693
|
+
from ..config import ExperimentConfig, LoadExperimentConfig
|
|
694
|
+
from ..readwrite import add_or_update_column_in_csv, make_dirs
|
|
577
695
|
from .helpers import get_adata_paths
|
|
578
696
|
|
|
579
697
|
date_str = datetime.today().strftime("%y%m%d")
|
|
@@ -613,25 +731,27 @@ def load_adata(config_path: str):
|
|
|
613
731
|
# -----------------------------
|
|
614
732
|
if not getattr(cfg, "force_redo_load_adata", False):
|
|
615
733
|
if paths.hmm.exists():
|
|
616
|
-
|
|
734
|
+
logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
|
|
617
735
|
return None, paths.hmm, cfg
|
|
618
736
|
if paths.spatial.exists():
|
|
619
|
-
|
|
737
|
+
logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
|
|
620
738
|
return None, paths.spatial, cfg
|
|
621
739
|
if paths.pp_dedup.exists():
|
|
622
|
-
|
|
740
|
+
logger.debug(
|
|
623
741
|
f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
|
|
624
742
|
f"Skipping smftools load"
|
|
625
743
|
)
|
|
626
744
|
return None, paths.pp_dedup, cfg
|
|
627
745
|
if paths.pp.exists():
|
|
628
|
-
|
|
746
|
+
logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
|
|
629
747
|
return None, paths.pp, cfg
|
|
630
748
|
if paths.raw.exists():
|
|
631
|
-
|
|
749
|
+
logger.debug(
|
|
750
|
+
f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
|
|
751
|
+
)
|
|
632
752
|
return None, paths.raw, cfg
|
|
633
753
|
|
|
634
754
|
# If we get here, we actually want to run the full load pipeline
|
|
635
|
-
adata, adata_path, cfg = load_adata_core(cfg, paths)
|
|
755
|
+
adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
|
|
636
756
|
|
|
637
|
-
return adata, adata_path, cfg
|
|
757
|
+
return adata, adata_path, cfg
|