smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Union, Iterable
|
|
4
|
+
|
|
5
|
+
from .helpers import AdataPaths
|
|
6
|
+
|
|
7
|
+
def check_executable_exists(cmd: str) -> bool:
    """Report whether the command-line executable *cmd* can be found on PATH."""
    resolved = shutil.which(cmd)
    return resolved is not None
|
|
10
|
+
|
|
11
|
+
def delete_tsvs(
    tsv_dir: Union[str, Path, Iterable[str], None],
    *,
    dry_run: bool = False,
    verbose: bool = True,
):
    """
    Delete intermediate tsv files.

    Parameters
    ----------
    tsv_dir : str | Path | Iterable[str] | None
        Either a directory to remove recursively (e.g. a tsv dir created
        earlier), or an iterable of individual tsv file paths to unlink
        one by one. ``None`` is a no-op.
    dry_run : bool
        If True, print what *would* be removed but do not actually delete.
    verbose : bool
        Print progress / warnings.
    """
    # Helper: remove a single file path (Path-like or string)
    def _maybe_unlink(p: Path):
        if not p.exists():
            if verbose:
                print(f"[skip] not found: {p}")
            return
        if not p.is_file():
            if verbose:
                print(f"[skip] not a file: {p}")
            return
        if dry_run:
            print(f"[dry-run] would remove file: {p}")
            return
        try:
            p.unlink()
            if verbose:
                print(f"Removed file: {p}")
        except Exception as e:
            # Best-effort cleanup: report and continue rather than abort.
            print(f"[error] failed to remove file {p}: {e}")

    if tsv_dir is None:
        return

    # Iterable of explicit file paths (matches the declared Iterable[str]
    # signature): unlink each file individually via the helper.
    if not isinstance(tsv_dir, (str, Path)):
        for item in tsv_dir:
            _maybe_unlink(Path(item))
        return

    # Single path: treat it as a directory and remove the tree recursively.
    td = Path(tsv_dir)
    if not td.exists():
        if verbose:
            print(f"[skip] tsv_dir not found: {td}")
        return
    if not td.is_dir():
        if verbose:
            print(f"[skip] tsv_dir is not a directory: {td}")
        return
    if dry_run:
        print(f"[dry-run] would remove directory tree: {td}")
        return
    try:
        shutil.rmtree(td)
        if verbose:
            print(f"Removed directory tree: {td}")
    except Exception as e:
        # Best-effort cleanup: report and continue rather than abort.
        print(f"[error] failed to remove tmp dir {td}: {e}")
|
|
70
|
+
|
|
71
|
+
def load_adata_core(cfg, paths: AdataPaths):
|
|
72
|
+
"""
|
|
73
|
+
Core load pipeline.
|
|
74
|
+
|
|
75
|
+
Assumes:
|
|
76
|
+
- cfg is a fully initialized ExperimentConfig
|
|
77
|
+
- paths is an AdataPaths object describing canonical h5ad stage paths
|
|
78
|
+
- No stage-skipping or early returns based on existing AnnDatas are done here
|
|
79
|
+
(that happens in the wrapper).
|
|
80
|
+
|
|
81
|
+
Does:
|
|
82
|
+
- handle input format (fast5/pod5/fastq/bam/h5ad)
|
|
83
|
+
- basecalling / alignment / demux / BAM QC
|
|
84
|
+
- optional bed + bigwig generation
|
|
85
|
+
- AnnData construction (conversion or direct modality)
|
|
86
|
+
- basic read-level QC annotations
|
|
87
|
+
- write raw AnnData to paths.raw
|
|
88
|
+
- run MultiQC
|
|
89
|
+
- optional deletion of intermediate BAMs
|
|
90
|
+
|
|
91
|
+
Returns
|
|
92
|
+
-------
|
|
93
|
+
raw_adata : anndata.AnnData
|
|
94
|
+
Newly created raw AnnData object.
|
|
95
|
+
raw_adata_path : Path
|
|
96
|
+
Path where the raw AnnData was written (paths.raw).
|
|
97
|
+
cfg : ExperimentConfig
|
|
98
|
+
(Same object, possibly with some fields updated, e.g. fasta path.)
|
|
99
|
+
"""
|
|
100
|
+
import os
|
|
101
|
+
from pathlib import Path
|
|
102
|
+
|
|
103
|
+
import numpy as np
|
|
104
|
+
import pandas as pd
|
|
105
|
+
import anndata as ad
|
|
106
|
+
import scanpy as sc
|
|
107
|
+
|
|
108
|
+
from .helpers import write_gz_h5ad
|
|
109
|
+
|
|
110
|
+
from ..readwrite import make_dirs, add_or_update_column_in_csv
|
|
111
|
+
|
|
112
|
+
from ..informatics.bam_functions import concatenate_fastqs_to_bam, align_and_sort_BAM, demux_and_index_BAM, split_and_index_BAM, bam_qc, extract_read_features_from_bam
|
|
113
|
+
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
114
|
+
from ..informatics.pod5_functions import fast5_to_pod5
|
|
115
|
+
from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
|
|
116
|
+
from ..informatics.basecalling import modcall, canoncall
|
|
117
|
+
from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
|
|
118
|
+
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
119
|
+
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
120
|
+
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
121
|
+
from ..informatics.run_multiqc import run_multiqc
|
|
122
|
+
|
|
123
|
+
################################### 1) General params and input organization ###################################
|
|
124
|
+
output_directory = Path(cfg.output_directory)
|
|
125
|
+
make_dirs([output_directory])
|
|
126
|
+
|
|
127
|
+
raw_adata_path = paths.raw
|
|
128
|
+
pp_adata_path = paths.pp
|
|
129
|
+
pp_dup_rem_adata_path = paths.pp_dedup
|
|
130
|
+
spatial_adata_path = paths.spatial
|
|
131
|
+
hmm_adata_path = paths.hmm
|
|
132
|
+
|
|
133
|
+
# Naming of the demultiplexed output directory
|
|
134
|
+
double_barcoded_path = cfg.split_path / "both_ends_barcoded"
|
|
135
|
+
single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"
|
|
136
|
+
|
|
137
|
+
# Direct methylation detection SMF specific parameters
|
|
138
|
+
if cfg.smf_modality == "direct":
|
|
139
|
+
mod_bed_dir = cfg.output_directory / "mod_beds"
|
|
140
|
+
add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
|
|
141
|
+
mod_tsv_dir = cfg.output_directory / "mod_tsvs"
|
|
142
|
+
add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
|
|
143
|
+
bam_qc_dir = cfg.output_directory / "bam_qc"
|
|
144
|
+
mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
|
|
145
|
+
|
|
146
|
+
if not check_executable_exists("dorado"):
|
|
147
|
+
raise RuntimeError(
|
|
148
|
+
"Error: 'dorado' is not installed or not in PATH. "
|
|
149
|
+
"Install from https://github.com/nanoporetech/dorado"
|
|
150
|
+
)
|
|
151
|
+
if not check_executable_exists("modkit"):
|
|
152
|
+
raise RuntimeError(
|
|
153
|
+
"Error: 'modkit' is not installed or not in PATH. "
|
|
154
|
+
"Install from https://github.com/nanoporetech/modkit"
|
|
155
|
+
)
|
|
156
|
+
else:
|
|
157
|
+
mod_bed_dir = None
|
|
158
|
+
mod_tsv_dir = None
|
|
159
|
+
mods = None
|
|
160
|
+
|
|
161
|
+
# demux / aligner executables
|
|
162
|
+
if (not cfg.input_already_demuxed) or cfg.aligner == "dorado":
|
|
163
|
+
if not check_executable_exists("dorado"):
|
|
164
|
+
raise RuntimeError(
|
|
165
|
+
"Error: 'dorado' is not installed or not in PATH. "
|
|
166
|
+
"Install from https://github.com/nanoporetech/dorado"
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
if cfg.aligner == "minimap2":
|
|
170
|
+
if not check_executable_exists("minimap2"):
|
|
171
|
+
raise RuntimeError(
|
|
172
|
+
"Error: 'minimap2' is not installed or not in PATH. "
|
|
173
|
+
"Install minimap2"
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# # Detect the input filetypes
|
|
177
|
+
# If the input files are fast5 files, convert the files to a pod5 file before proceeding.
|
|
178
|
+
if cfg.input_type == "fast5":
|
|
179
|
+
# take the input directory of fast5 files and write out a single pod5 file into the output directory.
|
|
180
|
+
output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
|
|
181
|
+
if output_pod5.exists():
|
|
182
|
+
pass
|
|
183
|
+
else:
|
|
184
|
+
print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
|
|
185
|
+
fast5_to_pod5(cfg.input_data_path, output_pod5)
|
|
186
|
+
# Reassign the pod5_dir variable to point to the new pod5 file.
|
|
187
|
+
cfg.input_data_path = output_pod5
|
|
188
|
+
cfg.input_type = "pod5"
|
|
189
|
+
# If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
|
|
190
|
+
elif cfg.input_type == "fastq":
|
|
191
|
+
# Output file for FASTQ concatenation.
|
|
192
|
+
output_bam = cfg.output_directory / 'canonical_basecalls.bam'
|
|
193
|
+
if output_bam.exists():
|
|
194
|
+
pass
|
|
195
|
+
else:
|
|
196
|
+
summary = concatenate_fastqs_to_bam(
|
|
197
|
+
cfg.input_files,
|
|
198
|
+
output_bam,
|
|
199
|
+
barcode_tag='BC',
|
|
200
|
+
gzip_suffixes=('.gz','.gzip'),
|
|
201
|
+
barcode_map=cfg.fastq_barcode_map,
|
|
202
|
+
add_read_group=True,
|
|
203
|
+
rg_sample_field=None,
|
|
204
|
+
progress=False,
|
|
205
|
+
auto_pair=cfg.fastq_auto_pairing)
|
|
206
|
+
|
|
207
|
+
print(f"Found the following barcodes: {summary['barcodes']}")
|
|
208
|
+
|
|
209
|
+
# Set the input data path to the concatenated BAM.
|
|
210
|
+
cfg.input_data_path = output_bam
|
|
211
|
+
cfg.input_type = "bam"
|
|
212
|
+
elif cfg.input_type == "h5ad":
|
|
213
|
+
pass
|
|
214
|
+
else:
|
|
215
|
+
pass
|
|
216
|
+
|
|
217
|
+
add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
|
|
218
|
+
|
|
219
|
+
# Determine if the input data needs to be basecalled
|
|
220
|
+
if cfg.input_type == "pod5":
|
|
221
|
+
print(f'Detected pod5 inputs: {cfg.input_files}')
|
|
222
|
+
basecall = True
|
|
223
|
+
elif cfg.input_type in ["bam"]:
|
|
224
|
+
print(f'Detected bam input: {cfg.input_files}')
|
|
225
|
+
basecall = False
|
|
226
|
+
else:
|
|
227
|
+
print('Error, can not find input bam or pod5')
|
|
228
|
+
|
|
229
|
+
# Generate the base name of the unaligned bam without the .bam suffix
|
|
230
|
+
if basecall:
|
|
231
|
+
model_basename = Path(cfg.model).name
|
|
232
|
+
model_basename = str(model_basename).replace('.', '_')
|
|
233
|
+
if cfg.smf_modality == 'direct':
|
|
234
|
+
mod_string = "_".join(cfg.mod_list)
|
|
235
|
+
bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
|
|
236
|
+
else:
|
|
237
|
+
bam = cfg.output_directory / f"{model_basename}_canonical_basecalls"
|
|
238
|
+
else:
|
|
239
|
+
bam_base = cfg.input_data_path.name
|
|
240
|
+
bam = cfg.output_directory / bam_base
|
|
241
|
+
|
|
242
|
+
# Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
|
|
243
|
+
unaligned_output = bam.with_suffix(cfg.bam_suffix)
|
|
244
|
+
aligned_BAM = cfg.output_directory / (bam.stem + "_aligned") # doing this allows specifying an input bam in a seperate directory as the aligned output bams
|
|
245
|
+
aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
|
|
246
|
+
aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
|
|
247
|
+
aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
|
|
248
|
+
|
|
249
|
+
add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
|
|
250
|
+
add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
|
|
251
|
+
add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
|
|
252
|
+
########################################################################################################################
|
|
253
|
+
|
|
254
|
+
################################### 2) FASTA Handling ###################################
|
|
255
|
+
from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
|
|
256
|
+
|
|
257
|
+
try:
|
|
258
|
+
cfg.fasta = Path(cfg.fasta)
|
|
259
|
+
except:
|
|
260
|
+
print("Need to provide an input FASTA path to proceed with smftools load")
|
|
261
|
+
|
|
262
|
+
# If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
|
|
263
|
+
if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
|
|
264
|
+
fasta_basename = cfg.fasta.parent / cfg.fasta.stem
|
|
265
|
+
bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
|
|
266
|
+
output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
|
|
267
|
+
subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
|
|
268
|
+
fasta = cfg.output_directory / output_FASTA
|
|
269
|
+
else:
|
|
270
|
+
fasta = cfg.fasta
|
|
271
|
+
|
|
272
|
+
# For conversion style SMF, make a converted reference FASTA
|
|
273
|
+
if cfg.smf_modality == 'conversion':
|
|
274
|
+
fasta_basename = fasta.parent / fasta.stem
|
|
275
|
+
converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
|
|
276
|
+
converted_FASTA = cfg.output_directory / converted_FASTA_basename
|
|
277
|
+
if 'converted.fa' in fasta.name:
|
|
278
|
+
print(f'{fasta} is already converted. Using existing converted FASTA.')
|
|
279
|
+
converted_FASTA = fasta
|
|
280
|
+
elif converted_FASTA.exists():
|
|
281
|
+
print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
|
|
282
|
+
else:
|
|
283
|
+
generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
|
|
284
|
+
fasta = converted_FASTA
|
|
285
|
+
|
|
286
|
+
add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
|
|
287
|
+
|
|
288
|
+
# Make a FAI and .chrom.names file for the fasta
|
|
289
|
+
get_chromosome_lengths(fasta)
|
|
290
|
+
########################################################################################################################
|
|
291
|
+
|
|
292
|
+
################################### 3) Basecalling ###################################
|
|
293
|
+
from ..informatics.basecalling import modcall, canoncall
|
|
294
|
+
# 1) Basecall using dorado
|
|
295
|
+
if basecall and cfg.sequencer == 'ont':
|
|
296
|
+
try:
|
|
297
|
+
cfg.model_dir = Path(cfg.model_dir)
|
|
298
|
+
except:
|
|
299
|
+
print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
|
|
300
|
+
if aligned_sorted_output.exists():
|
|
301
|
+
print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
|
|
302
|
+
elif unaligned_output.exists():
|
|
303
|
+
print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
|
|
304
|
+
elif cfg.smf_modality != 'direct':
|
|
305
|
+
canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
|
|
306
|
+
else:
|
|
307
|
+
modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
|
|
308
|
+
elif basecall:
|
|
309
|
+
print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
|
|
310
|
+
else:
|
|
311
|
+
pass
|
|
312
|
+
########################################################################################################################
|
|
313
|
+
|
|
314
|
+
################################### 4) Alignment and sorting #############################################
|
|
315
|
+
from ..informatics.bam_functions import align_and_sort_BAM
|
|
316
|
+
from ..informatics.bed_functions import aligned_BAM_to_bed
|
|
317
|
+
# 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
|
|
318
|
+
if aligned_sorted_output.exists():
|
|
319
|
+
print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
|
|
320
|
+
else:
|
|
321
|
+
align_and_sort_BAM(fasta, unaligned_output, cfg)
|
|
322
|
+
# Deleted the unsorted aligned output
|
|
323
|
+
aligned_output.unlink()
|
|
324
|
+
|
|
325
|
+
if cfg.make_beds:
|
|
326
|
+
# Make beds and provide basic histograms
|
|
327
|
+
bed_dir = cfg.output_directory / 'beds'
|
|
328
|
+
if bed_dir.is_dir():
|
|
329
|
+
print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
|
|
330
|
+
else:
|
|
331
|
+
aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
|
|
332
|
+
########################################################################################################################
|
|
333
|
+
|
|
334
|
+
################################### 5) Demultiplexing ######################################################################
|
|
335
|
+
from ..informatics.bam_functions import demux_and_index_BAM, split_and_index_BAM
|
|
336
|
+
# 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
|
|
337
|
+
if cfg.input_already_demuxed:
|
|
338
|
+
if cfg.split_path.is_dir():
|
|
339
|
+
print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
|
|
340
|
+
|
|
341
|
+
all_bam_files = sorted(
|
|
342
|
+
p for p in cfg.split_path.iterdir()
|
|
343
|
+
if p.is_file()
|
|
344
|
+
and p.suffix == cfg.bam_suffix
|
|
345
|
+
)
|
|
346
|
+
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
347
|
+
bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
|
|
348
|
+
|
|
349
|
+
else:
|
|
350
|
+
make_dirs([cfg.split_path])
|
|
351
|
+
all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
|
|
352
|
+
cfg.split_path,
|
|
353
|
+
cfg.bam_suffix)
|
|
354
|
+
|
|
355
|
+
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
|
|
356
|
+
bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
|
|
357
|
+
|
|
358
|
+
se_bam_files = bam_files
|
|
359
|
+
bam_dir = cfg.split_path
|
|
360
|
+
|
|
361
|
+
else:
|
|
362
|
+
if single_barcoded_path.is_dir():
|
|
363
|
+
print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
|
|
364
|
+
|
|
365
|
+
all_se_bam_files = sorted(
|
|
366
|
+
p for p in single_barcoded_path.iterdir()
|
|
367
|
+
if p.is_file()
|
|
368
|
+
and p.suffix == cfg.bam_suffix
|
|
369
|
+
)
|
|
370
|
+
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
371
|
+
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
372
|
+
else:
|
|
373
|
+
make_dirs([cfg.split_path, single_barcoded_path])
|
|
374
|
+
all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
|
|
375
|
+
single_barcoded_path,
|
|
376
|
+
cfg.bam_suffix,
|
|
377
|
+
cfg.barcode_kit,
|
|
378
|
+
False,
|
|
379
|
+
cfg.trim,
|
|
380
|
+
cfg.threads)
|
|
381
|
+
|
|
382
|
+
unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
|
|
383
|
+
se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
|
|
384
|
+
|
|
385
|
+
if double_barcoded_path.is_dir():
|
|
386
|
+
print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
|
|
387
|
+
|
|
388
|
+
all_de_bam_files = sorted(
|
|
389
|
+
p for p in double_barcoded_path.iterdir()
|
|
390
|
+
if p.is_file()
|
|
391
|
+
and p.suffix == cfg.bam_suffix
|
|
392
|
+
)
|
|
393
|
+
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
394
|
+
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
395
|
+
else:
|
|
396
|
+
make_dirs([cfg.split_path, double_barcoded_path])
|
|
397
|
+
all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
|
|
398
|
+
double_barcoded_path,
|
|
399
|
+
cfg.bam_suffix,
|
|
400
|
+
cfg.barcode_kit,
|
|
401
|
+
True,
|
|
402
|
+
cfg.trim,
|
|
403
|
+
cfg.threads)
|
|
404
|
+
|
|
405
|
+
unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
|
|
406
|
+
de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
|
|
407
|
+
|
|
408
|
+
bam_files = se_bam_files + de_bam_files
|
|
409
|
+
unclassified_bams = unclassified_se_bams + unclassified_de_bams
|
|
410
|
+
bam_dir = single_barcoded_path
|
|
411
|
+
|
|
412
|
+
add_or_update_column_in_csv(cfg.summary_file, "demuxed_bams", [se_bam_files])
|
|
413
|
+
|
|
414
|
+
if cfg.make_beds:
|
|
415
|
+
# Make beds and provide basic histograms
|
|
416
|
+
bed_dir = cfg.split_path / 'beds'
|
|
417
|
+
if bed_dir.is_dir():
|
|
418
|
+
print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
|
|
419
|
+
else:
|
|
420
|
+
for bam in bam_files:
|
|
421
|
+
aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
|
|
422
|
+
########################################################################################################################
|
|
423
|
+
|
|
424
|
+
################################### 6) SAMTools based BAM QC ######################################################################
|
|
425
|
+
from ..informatics.bam_functions import bam_qc
|
|
426
|
+
# 5) Samtools QC metrics on split BAM files
|
|
427
|
+
bam_qc_dir = cfg.split_path / "bam_qc"
|
|
428
|
+
if bam_qc_dir.is_dir():
|
|
429
|
+
print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
|
|
430
|
+
else:
|
|
431
|
+
make_dirs([bam_qc_dir])
|
|
432
|
+
bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
|
|
433
|
+
########################################################################################################################
|
|
434
|
+
|
|
435
|
+
################################### 7) AnnData loading ######################################################################
|
|
436
|
+
if cfg.smf_modality != 'direct':
|
|
437
|
+
from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
|
|
438
|
+
# 6) Take the converted BAM and load it into an adata object.
|
|
439
|
+
if cfg.smf_modality == 'deaminase':
|
|
440
|
+
deaminase_footprinting = True
|
|
441
|
+
else:
|
|
442
|
+
deaminase_footprinting = False
|
|
443
|
+
raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
|
|
444
|
+
bam_dir,
|
|
445
|
+
cfg.output_directory,
|
|
446
|
+
cfg.input_already_demuxed,
|
|
447
|
+
cfg.mapping_threshold,
|
|
448
|
+
cfg.experiment_name,
|
|
449
|
+
cfg.conversion_types,
|
|
450
|
+
cfg.bam_suffix,
|
|
451
|
+
cfg.device,
|
|
452
|
+
cfg.threads,
|
|
453
|
+
deaminase_footprinting,
|
|
454
|
+
delete_intermediates=cfg.delete_intermediate_hdfs,
|
|
455
|
+
double_barcoded_path=double_barcoded_path)
|
|
456
|
+
else:
|
|
457
|
+
if mod_bed_dir.is_dir():
|
|
458
|
+
print(f'{mod_bed_dir} already exists, skipping making modbeds')
|
|
459
|
+
else:
|
|
460
|
+
from ..informatics.modkit_functions import modQC, make_modbed
|
|
461
|
+
make_dirs([mod_bed_dir])
|
|
462
|
+
|
|
463
|
+
modQC(aligned_sorted_output,
|
|
464
|
+
cfg.thresholds) # get QC metrics for mod calls
|
|
465
|
+
|
|
466
|
+
make_modbed(aligned_sorted_output,
|
|
467
|
+
cfg.thresholds,
|
|
468
|
+
mod_bed_dir) # Generate bed files of position methylation summaries for every sample
|
|
469
|
+
|
|
470
|
+
from ..informatics.modkit_functions import extract_mods
|
|
471
|
+
make_dirs([mod_tsv_dir])
|
|
472
|
+
|
|
473
|
+
extract_mods(cfg.thresholds,
|
|
474
|
+
mod_tsv_dir,
|
|
475
|
+
bam_dir,
|
|
476
|
+
cfg.bam_suffix,
|
|
477
|
+
skip_unclassified=cfg.skip_unclassified,
|
|
478
|
+
modkit_summary=False,
|
|
479
|
+
threads=cfg.threads) # Extract methylations calls for split BAM files into split TSV files
|
|
480
|
+
|
|
481
|
+
from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
|
|
482
|
+
#6 Load the modification data from TSVs into an adata object
|
|
483
|
+
raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
|
|
484
|
+
bam_dir,
|
|
485
|
+
cfg.output_directory,
|
|
486
|
+
cfg.input_already_demuxed,
|
|
487
|
+
cfg.mapping_threshold,
|
|
488
|
+
cfg.experiment_name,
|
|
489
|
+
mods,
|
|
490
|
+
cfg.batch_size,
|
|
491
|
+
mod_tsv_dir,
|
|
492
|
+
cfg.delete_batch_hdfs,
|
|
493
|
+
cfg.threads,
|
|
494
|
+
double_barcoded_path)
|
|
495
|
+
if cfg.delete_intermediate_tsvs:
|
|
496
|
+
delete_tsvs(mod_tsv_dir)
|
|
497
|
+
|
|
498
|
+
raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
|
|
499
|
+
raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
|
|
500
|
+
|
|
501
|
+
########################################################################################################################
|
|
502
|
+
|
|
503
|
+
############################################### Add basic read length, read quality, mapping quality stats ###############################################
|
|
504
|
+
from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
|
|
505
|
+
from ..informatics.bam_functions import extract_read_features_from_bam
|
|
506
|
+
add_read_length_and_mapping_qc(raw_adata, se_bam_files,
|
|
507
|
+
extract_read_features_from_bam_callable=extract_read_features_from_bam,
|
|
508
|
+
bypass=cfg.bypass_add_read_length_and_mapping_qc,
|
|
509
|
+
force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
|
|
510
|
+
|
|
511
|
+
raw_adata.obs['Raw_modification_signal'] = np.nansum(raw_adata.X, axis=1)
|
|
512
|
+
########################################################################################################################
|
|
513
|
+
|
|
514
|
+
############################################### Save final adata ###############################################
|
|
515
|
+
print(f"Saving AnnData to {raw_adata_path}")
|
|
516
|
+
write_gz_h5ad(raw_adata, raw_adata_path)
|
|
517
|
+
########################################################################################################################
|
|
518
|
+
|
|
519
|
+
############################################### MultiQC HTML Report ###############################################
|
|
520
|
+
from ..informatics.run_multiqc import run_multiqc
|
|
521
|
+
# multiqc ###
|
|
522
|
+
mqc_dir = cfg.split_path / "multiqc"
|
|
523
|
+
if mqc_dir.is_dir():
|
|
524
|
+
print(f'{mqc_dir} already exists, skipping multiqc')
|
|
525
|
+
else:
|
|
526
|
+
run_multiqc(cfg.split_path, mqc_dir)
|
|
527
|
+
########################################################################################################################
|
|
528
|
+
|
|
529
|
+
############################################### delete intermediate BAM files ###############################################
|
|
530
|
+
if cfg.delete_intermediate_bams:
|
|
531
|
+
# delete aligned and sorted bam
|
|
532
|
+
aligned_sorted_output.unlink()
|
|
533
|
+
bai = aligned_sorted_output.parent / (aligned_sorted_output.name + '.bai')
|
|
534
|
+
bai.unlink()
|
|
535
|
+
# delete the demultiplexed bams. Keep the demultiplexing summary files and directories to faciliate demultiplexing in the future with these files
|
|
536
|
+
for bam in bam_files:
|
|
537
|
+
bai = bam.parent / (bam.name + '.bai')
|
|
538
|
+
bam.unlink()
|
|
539
|
+
bai.unlink()
|
|
540
|
+
for bam in unclassified_bams:
|
|
541
|
+
bai = bam.parent / (bam.name + '.bai')
|
|
542
|
+
bam.unlink()
|
|
543
|
+
bai.unlink()
|
|
544
|
+
########################################################################################################################
|
|
545
|
+
|
|
546
|
+
return raw_adata, raw_adata_path, cfg
|
|
547
|
+
|
|
548
|
+
def load_adata(config_path: str):
    """
    CLI-facing wrapper for the load pipeline.

    - Reads config CSV into ExperimentConfig
    - Computes canonical paths for all downstream AnnData stages
    - Registers those in the summary CSV
    - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
    - If needed, calls the core pipeline to actually build the raw AnnData

    Parameters
    ----------
    config_path : str
        Path to the experiment config CSV consumed by LoadExperimentConfig.

    Returns
    -------
    adata : anndata.AnnData | None
        Newly created AnnData object, or None if we skipped because a later-stage
        AnnData already exists.
    adata_path : pathlib.Path
        Path to the "current" AnnData that should be used downstream.
    cfg : ExperimentConfig
        Config object for downstream steps.
    """
    from importlib import resources
    from datetime import datetime

    from ..readwrite import make_dirs, add_or_update_column_in_csv
    from ..config import LoadExperimentConfig, ExperimentConfig

    from .helpers import get_adata_paths

    # Date stamp used by ExperimentConfig for canonical output naming.
    date_str = datetime.today().strftime("%y%m%d")

    # -----------------------------
    # 1) Load config into cfg
    # -----------------------------
    loader = LoadExperimentConfig(config_path)
    defaults_dir = resources.files("smftools").joinpath("config")
    # _report (validation/merge report) is intentionally unused here.
    cfg, _report = ExperimentConfig.from_var_dict(
        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
    )

    # Ensure base output dir
    make_dirs([cfg.output_directory])

    # -----------------------------
    # 2) Compute and register paths
    # -----------------------------
    paths = get_adata_paths(cfg)

    # experiment-level metadata in summary CSV
    add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
    add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
    add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
    add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])

    # AnnData stage paths
    add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
    add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
    add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
    add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)

    # -----------------------------
    # 3) Stage skipping logic
    # -----------------------------
    # Check the most-downstream artifact first; the first one that exists on
    # disk wins and short-circuits the whole load pipeline.
    if not getattr(cfg, "force_redo_load_adata", False):
        skip_order = [
            (paths.hmm, "HMM AnnData already exists"),
            (paths.spatial, "Spatial AnnData already exists"),
            (paths.pp_dedup, "Preprocessed deduplicated AnnData already exists"),
            (paths.pp, "Preprocessed AnnData already exists"),
            (paths.raw, "Raw AnnData from smftools load already exists"),
        ]
        for stage_path, message in skip_order:
            if stage_path.exists():
                print(f"{message}: {stage_path}\nSkipping smftools load")
                return None, stage_path, cfg

    # If we get here, we actually want to run the full load pipeline
    adata, adata_path, cfg = load_adata_core(cfg, paths)

    return adata, adata_path, cfg
|