smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +32 -6
- smftools/cli/hmm_adata.py +232 -31
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +77 -73
- smftools/cli/preprocess_adata.py +178 -53
- smftools/cli/spatial_adata.py +149 -101
- smftools/cli_entry.py +12 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +38 -1
- smftools/config/experiment_config.py +53 -1
- smftools/constants.py +65 -0
- smftools/hmm/HMM.py +88 -0
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/converted_BAM_to_adata.py +584 -163
- smftools/informatics/h5ad_functions.py +115 -2
- smftools/informatics/modkit_extract_to_adata.py +1003 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +9 -0
- smftools/plotting/general_plotting.py +2411 -628
- smftools/plotting/hmm_plotting.py +85 -7
- smftools/preprocessing/__init__.py +1 -0
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +4 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +91 -8
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/cli/latent_adata.py
ADDED
@@ -0,0 +1,318 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Optional, Tuple
+
+import anndata as ad
+
+from smftools.constants import LATENT_DIR, LOGGING_DIR, SEQUENCE_INTEGER_ENCODING
+from smftools.logging_utils import get_logger, setup_logging
+
+logger = get_logger(__name__)
+
+
+def latent_adata(
+    config_path: str,
+) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
+    """
+    CLI-facing wrapper for representation learning.
+
+    Called by: `smftools latent <config_path>`
+
+    Responsibilities:
+    - Determine which AnnData stages exist (pp, pp_dedup, spatial, hmm).
+    - Call `latent_adata_core(...)` when actual work is needed.
+
+    Returns
+    -------
+    latent_adata : AnnData | None
+        AnnData with latent analyses, or None if we skipped because a later-stage
+        AnnData already exists.
+    latent_adata_path : Path | None
+        Path to the “current” latent AnnData.
+    """
+    from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
+    from .helpers import get_adata_paths, load_experiment_config
+
+    # 1) Ensure config + basic paths via load_adata
+    cfg = load_experiment_config(config_path)
+
+    paths = get_adata_paths(cfg)
+
+    pp_path = paths.pp
+    pp_dedup_path = paths.pp_dedup
+    spatial_path = paths.spatial
+    hmm_path = paths.hmm
+    latent_path = paths.latent
+
+    # Stage-skipping logic for latent
+    if not getattr(cfg, "force_redo_latent_analyses", False):
+        # If latent exists, we consider latent analyses already done.
+        if latent_path.exists():
+            logger.info(f"Latent AnnData found: {latent_path}\nSkipping smftools latent")
+            return None, latent_path
+
+    # Helper to load from disk, reusing loaded_adata if it matches
+    def _load(path: Path):
+        adata, _ = safe_read_h5ad(path)
+        return adata
+
+    # 3) Decide which AnnData to use as the *starting point* for latent analyses
+    if latent_path.exists():
+        start_adata = _load(latent_path)
+        source_path = latent_path
+    elif hmm_path.exists():
+        start_adata = _load(hmm_path)
+        source_path = hmm_path
+    elif spatial_path.exists():
+        start_adata = _load(spatial_path)
+        source_path = spatial_path
+    elif pp_dedup_path.exists():
+        start_adata = _load(pp_dedup_path)
+        source_path = pp_dedup_path
+    elif pp_path.exists():
+        start_adata = _load(pp_path)
+        source_path = pp_path
+    else:
+        logger.warning(
+            "No suitable AnnData found for latent analyses (need at least preprocessed)."
+        )
+        return None, None
+
+    # 4) Run the latent core
+    adata_latent, latent_path = latent_adata_core(
+        adata=start_adata,
+        cfg=cfg,
+        paths=paths,
+        source_adata_path=source_path,
+        config_path=config_path,
+    )
+
+    return adata_latent, latent_path
+
+
+def latent_adata_core(
+    adata: ad.AnnData,
+    cfg,
+    paths: AdataPaths,
+    source_adata_path: Optional[Path] = None,
+    config_path: Optional[str] = None,
+) -> Tuple[ad.AnnData, Path]:
+    """
+    Core spatial analysis pipeline.
+
+    Assumes:
+    - `adata` is (typically) the preprocessed, duplicate-removed AnnData.
+    - `cfg` is the ExperimentConfig.
+
+    Does:
+    - Optional sample sheet load.
+    - Optional inversion & reindexing.
+    - PCA/UMAP/Leiden.
+    - Save latent AnnData to `latent_adata_path`.
+
+    Returns
+    -------
+    adata : AnnData
+        analyzed AnnData (same object, modified in-place).
+    adata_path : Path
+        Path where AnnData was written.
+    """
+    import os
+    import warnings
+    from datetime import datetime
+    from pathlib import Path
+
+    import numpy as np
+    import pandas as pd
+
+    from ..metadata import record_smftools_metadata
+    from ..plotting import (
+        plot_cp_sequence_components,
+        plot_embedding,
+        plot_nmf_components,
+        plot_pca,
+        plot_umap,
+    )
+    from ..preprocessing import (
+        invert_adata,
+        load_sample_sheet,
+        reindex_references_adata,
+    )
+    from ..readwrite import make_dirs, safe_read_h5ad
+    from ..tools import (
+        calculate_leiden,
+        calculate_nmf,
+        calculate_sequence_cp_decomposition,
+        calculate_umap,
+    )
+    from .helpers import write_gz_h5ad
+
+    # -----------------------------
+    # General setup
+    # -----------------------------
+    date_str = datetime.today().strftime("%y%m%d")
+    now = datetime.now()
+    time_str = now.strftime("%H%M%S")
+    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
+
+    latent_adata_path = paths.latent
+
+    output_directory = Path(cfg.output_directory)
+    latent_directory = output_directory / LATENT_DIR
+    logging_directory = latent_directory / LOGGING_DIR
+
+    make_dirs([output_directory, latent_directory])
+
+    if cfg.emit_log_file:
+        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
+        make_dirs([logging_directory])
+    else:
+        log_file = None
+
+    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
+
+    smf_modality = cfg.smf_modality
+    if smf_modality == "conversion":
+        deaminase = False
+    else:
+        deaminase = True
+
+    # -----------------------------
+    # Optional sample sheet metadata
+    # -----------------------------
+    if getattr(cfg, "sample_sheet_path", None):
+        load_sample_sheet(
+            adata,
+            cfg.sample_sheet_path,
+            mapping_key_column=cfg.sample_sheet_mapping_column,
+            as_category=True,
+            force_reload=cfg.force_reload_sample_sheet,
+        )
+
+    # -----------------------------
+    # Optional inversion along positions axis
+    # -----------------------------
+    if getattr(cfg, "invert_adata", False):
+        adata = invert_adata(adata)
+
+    # -----------------------------
+    # Optional reindexing by reference
+    # -----------------------------
+    reindex_references_adata(
+        adata,
+        reference_col=cfg.reference_column,
+        offsets=cfg.reindexing_offsets,
+        new_col=cfg.reindexed_var_suffix,
+    )
+
+    if adata.uns.get("reindex_references_adata_performed", False):
+        reindex_suffix = cfg.reindexed_var_suffix
+    else:
+        reindex_suffix = None
+
+    references = adata.obs[cfg.reference_column].cat.categories
+
+    # ============================================================
+    # 2) PCA/UMAP on *deduplicated* preprocessed AnnData
+    # ============================================================
+    latent_dir_dedup = latent_directory / "deduplicated"
+    umap_dir = latent_dir_dedup / "07_umaps"
+    nmf_dir = latent_dir_dedup / "07b_nmf"
+    nmf_sequence_dir = latent_dir_dedup / "07c_nmf_sequence"
+
+    var_filters = []
+    if smf_modality == "direct":
+        for ref in references:
+            for base in cfg.mod_target_bases:
+                var_filters.append(f"{ref}_{base}_site")
+    elif deaminase:
+        for ref in references:
+            var_filters.append(f"{ref}_C_site")
+    else:
+        for ref in references:
+            for base in cfg.mod_target_bases:
+                var_filters.append(f"{ref}_{base}_site")
+
+    # UMAP / Leiden
+    if umap_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
+        logger.debug(f"{umap_dir} already exists. Skipping UMAP plotting.")
+    else:
+        make_dirs([umap_dir])
+
+        adata = calculate_umap(
+            adata,
+            layer=cfg.layer_for_umap_plotting,
+            var_filters=var_filters,
+            n_pcs=10,
+            knn_neighbors=15,
+        )
+
+        calculate_leiden(adata, resolution=0.1)
+
+        umap_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
+        umap_layers += cfg.umap_layers_to_plot
+        plot_umap(adata, color=umap_layers, output_dir=umap_dir)
+        plot_pca(adata, color=umap_layers, output_dir=umap_dir)
+
+    # NMF
+    if nmf_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
+        logger.debug(f"{nmf_dir} already exists. Skipping NMF plotting.")
+    else:
+        make_dirs([nmf_dir])
+        adata = calculate_nmf(
+            adata,
+            layer=cfg.layer_for_umap_plotting,
+            var_filters=var_filters,
+            n_components=5,
+        )
+        nmf_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
+        nmf_layers += cfg.umap_layers_to_plot
+        plot_embedding(adata, basis="nmf", color=nmf_layers, output_dir=nmf_dir)
+        plot_nmf_components(adata, output_dir=nmf_dir)
+
+    # CP decomposition using sequence integer encoding (no var filters)
+    if nmf_sequence_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
+        logger.debug(f"{nmf_sequence_dir} already exists. Skipping sequence CP plotting.")
+    elif SEQUENCE_INTEGER_ENCODING not in adata.layers:
+        logger.warning(
+            "Layer %s not found; skipping sequence integer encoding CP.",
+            SEQUENCE_INTEGER_ENCODING,
+        )
+    else:
+        make_dirs([nmf_sequence_dir])
+        adata = calculate_sequence_cp_decomposition(
+            adata,
+            layer=SEQUENCE_INTEGER_ENCODING,
+            rank=5,
+            embedding_key="X_cp_sequence",
+            components_key="H_cp_sequence",
+            uns_key="cp_sequence",
+        )
+        nmf_layers = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
+        nmf_layers += cfg.umap_layers_to_plot
+        plot_embedding(adata, basis="cp_sequence", color=nmf_layers, output_dir=nmf_sequence_dir)
+        plot_cp_sequence_components(
+            adata,
+            output_dir=nmf_sequence_dir,
+            components_key="H_cp_sequence",
+            uns_key="cp_sequence",
+        )
+
+    # ============================================================
+    # 5) Save latent AnnData
+    # ============================================================
+    if (not latent_adata_path.exists()) or getattr(cfg, "force_redo_latent_analyses", False):
+        logger.info("Saving latent analyzed AnnData (post preprocessing and duplicate removal).")
+        record_smftools_metadata(
+            adata,
+            step_name="latent",
+            cfg=cfg,
+            config_path=config_path,
+            input_paths=[source_adata_path] if source_adata_path else None,
+            output_path=latent_adata_path,
+        )
+        write_gz_h5ad(adata, latent_adata_path)
+
+    return adata, latent_adata_path
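
Usage note (not part of the published diff): the wrapper above is the entry point for the new `smftools latent` stage and, per its docstring, is invoked as `smftools latent <config_path>`; it can also be driven directly from Python. A minimal sketch, assuming a valid experiment config at the hypothetical path `experiment_config.yaml`:

    from smftools.cli.latent_adata import latent_adata

    # Returns (AnnData | None, Path | None). A None AnnData means the stage was
    # skipped: either a latent AnnData already existed on disk, or no suitable
    # upstream AnnData (pp, pp_dedup, spatial, hmm) was found.
    adata, latent_path = latent_adata("experiment_config.yaml")
    if adata is None:
        print(f"Latent stage skipped; current latent AnnData path: {latent_path}")

When the stage does run, `latent_adata_core` records provenance via `record_smftools_metadata` and writes the result to `paths.latent` with `write_gz_h5ad`, as shown in the diff above.
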
smftools/cli/load_adata.py
CHANGED
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
+import logging
 import shutil
 from pathlib import Path
 from typing import Iterable, Union
 
 import numpy as np
 
-from smftools.
+from smftools.constants import HMM_DIR, LOAD_DIR, LOGGING_DIR, PREPROCESS_DIR, SPATIAL_DIR
+from smftools.logging_utils import get_logger, setup_logging
 
 from .helpers import AdataPaths
 
@@ -103,63 +105,29 @@ def load_adata(config_path: str):
     from datetime import datetime
     from importlib import resources
 
-    from ..config import ExperimentConfig, LoadExperimentConfig
     from ..readwrite import add_or_update_column_in_csv, make_dirs
-    from .helpers import get_adata_paths
-
-    date_str = datetime.today().strftime("%y%m%d")
+    from .helpers import get_adata_paths, load_experiment_config
 
     # -----------------------------
     # 1) Load config into cfg
     # -----------------------------
-
-    defaults_dir = resources.files("smftools").joinpath("config")
-    cfg, report = ExperimentConfig.from_var_dict(
-        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
-    )
+    cfg = load_experiment_config(config_path)
 
     # Ensure base output dir
-
+    output_directory = Path(cfg.output_directory)
+    make_dirs([output_directory])
 
     # -----------------------------
     # 2) Compute and register paths
     # -----------------------------
     paths = get_adata_paths(cfg)
 
-    # experiment-level metadata in summary CSV
-    add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
-    add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
-    add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
-    add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
-
-    # AnnData stage paths
-    add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
-    add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
-    add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
-    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
-    add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
-
     # -----------------------------
     # 3) Stage skipping logic
     # -----------------------------
     if not getattr(cfg, "force_redo_load_adata", False):
-        if paths.hmm.exists():
-            logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
-            return None, paths.hmm, cfg
-        if paths.spatial.exists():
-            logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
-            return None, paths.spatial, cfg
-        if paths.pp_dedup.exists():
-            logger.debug(
-                f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
-                f"Skipping smftools load"
-            )
-            return None, paths.pp_dedup, cfg
-        if paths.pp.exists():
-            logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
-            return None, paths.pp, cfg
         if paths.raw.exists():
-            logger.
+            logger.info(
                 f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
             )
             return None, paths.raw, cfg
@@ -199,6 +167,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     cfg : ExperimentConfig
         (Same object, possibly with some fields updated, e.g. fasta path.)
     """
+    from datetime import datetime
 
     from ..informatics.bam_functions import (
         align_and_sort_BAM,
@@ -206,6 +175,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         concatenate_fastqs_to_bam,
         demux_and_index_BAM,
         extract_read_features_from_bam,
+        extract_read_tags_from_bam,
         split_and_index_BAM,
     )
     from ..informatics.basecalling import canoncall, modcall
@@ -216,7 +186,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         get_chromosome_lengths,
         subsample_fasta_from_bed,
     )
-    from ..informatics.h5ad_functions import
+    from ..informatics.h5ad_functions import (
+        add_read_length_and_mapping_qc,
+        add_read_tag_annotations,
+        add_secondary_supplementary_alignment_flags,
+    )
     from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
     from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
     from ..informatics.pod5_functions import fast5_to_pod5
@@ -226,8 +200,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     from .helpers import write_gz_h5ad
 
     ################################### 1) General params and input organization ###################################
+    date_str = datetime.today().strftime("%y%m%d")
+    now = datetime.now()
+    time_str = now.strftime("%H%M%S")
+
+    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
+
     output_directory = Path(cfg.output_directory)
-
+    load_directory = output_directory / LOAD_DIR
+    logging_directory = load_directory / LOGGING_DIR
+
+    make_dirs([output_directory, load_directory])
+
+    if cfg.emit_log_file:
+        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
+        make_dirs([logging_directory])
+    else:
+        log_file = None
+
+    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
 
     raw_adata_path = paths.raw
     pp_adata_path = paths.pp
@@ -241,11 +232,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
 
     # Direct methylation detection SMF specific parameters
     if cfg.smf_modality == "direct":
-        mod_bed_dir =
-
-
-        add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
-        bam_qc_dir = cfg.output_directory / "bam_qc"
+        mod_bed_dir = load_directory / "mod_beds"
+        mod_tsv_dir = load_directory / "mod_tsvs"
+        bam_qc_dir = load_directory / "bam_qc"
         mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
 
         if not check_executable_exists("dorado"):
@@ -281,7 +270,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
    if cfg.input_type == "fast5":
         # take the input directory of fast5 files and write out a single pod5 file into the output directory.
-        output_pod5 =
+        output_pod5 = load_directory / "FAST5s_to_POD5.pod5"
         if output_pod5.exists():
             pass
         else:
@@ -295,7 +284,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
     elif cfg.input_type == "fastq":
         # Output file for FASTQ concatenation.
-        output_bam =
+        output_bam = load_directory / "canonical_basecalls.bam"
         if output_bam.exists():
             logger.debug("Output BAM already exists")
         else:
@@ -323,8 +312,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     else:
         pass
 
-    add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
-
     # Determine if the input data needs to be basecalled
     if cfg.input_type == "pod5":
         logger.info(f"Detected pod5 inputs: {cfg.input_files}")
@@ -341,25 +328,24 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         model_basename = str(model_basename).replace(".", "_")
         if cfg.smf_modality == "direct":
             mod_string = "_".join(cfg.mod_list)
-            bam =
+            bam = load_directory / f"{model_basename}_{mod_string}_calls"
         else:
-            bam =
+            bam = load_directory / f"{model_basename}_canonical_basecalls"
     else:
-        bam_base = cfg.input_data_path.
-        bam = cfg.
+        bam_base = cfg.input_data_path.stem
+        bam = cfg.input_data_path.parent / bam_base
 
     # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
     unaligned_output = bam.with_suffix(cfg.bam_suffix)
+
     aligned_BAM = (
-
+        load_directory / (bam.stem + "_aligned")
     ) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
+
     aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
     aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
     aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
 
-    add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
-    add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
-    add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
     ########################################################################################################################
 
     ################################### 2) FASTA Handling ###################################
@@ -373,11 +359,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
         fasta_stem = cfg.fasta.stem
         bed_stem = Path(cfg.fasta_regions_of_interest).stem
-        output_FASTA =
+        output_FASTA = load_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
 
         logger.info("Subsampling FASTA records using the provided BED file")
         subsample_fasta_from_bed(
-            cfg.fasta, cfg.fasta_regions_of_interest,
+            cfg.fasta, cfg.fasta_regions_of_interest, load_directory, output_FASTA
         )
         fasta = output_FASTA
     else:
@@ -388,7 +374,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     if cfg.smf_modality == "conversion":
         fasta_stem = fasta.stem
         converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
-        converted_FASTA =
+        converted_FASTA = load_directory / converted_FASTA_basename
 
         if "converted.fa" in fasta.name:
             logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
@@ -400,8 +386,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
             generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
         fasta = converted_FASTA
 
-    add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
-
     # Make a FAI and .chrom.names file for the fasta
     get_chromosome_lengths(fasta)
     ########################################################################################################################
@@ -462,13 +446,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
     else:
         logger.info(f"Aligning and sorting reads")
-        align_and_sort_BAM(fasta, unaligned_output, cfg)
+        align_and_sort_BAM(fasta, unaligned_output, aligned_output, cfg)
         # Deleted the unsorted aligned output
         aligned_output.unlink()
 
     if cfg.make_beds:
         # Make beds and provide basic histograms
-        bed_dir =
+        bed_dir = load_directory / "beds"
         if bed_dir.is_dir():
             logger.debug(
                 f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
@@ -477,7 +461,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
             logger.info("Making bed files from the aligned and sorted BAM file")
             aligned_BAM_to_bed(
                 aligned_sorted_output,
-
+                load_directory,
                 fasta,
                 cfg.make_bigwigs,
                 cfg.threads,
@@ -515,6 +499,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
 
         se_bam_files = bam_files
         bam_dir = cfg.split_path
+        double_barcoded_path = None
 
     else:
         if single_barcoded_path.is_dir():
@@ -608,7 +593,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     ################################### 6) SAMTools based BAM QC ######################################################################
 
     # 5) Samtools QC metrics on split BAM files
-    bam_qc_dir =
+    bam_qc_dir = load_directory / "bam_qc"
     if bam_qc_dir.is_dir():
         logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
     else:
@@ -637,7 +622,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         raw_adata, raw_adata_path = converted_BAM_to_adata(
             fasta,
             bam_dir,
-
+            load_directory,
             cfg.input_already_demuxed,
             cfg.mapping_threshold,
             cfg.experiment_name,
@@ -694,7 +679,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         raw_adata, raw_adata_path = modkit_extract_to_adata(
             fasta,
             bam_dir,
-
+            load_directory,
            cfg.input_already_demuxed,
             cfg.mapping_threshold,
             cfg.experiment_name,
@@ -728,6 +713,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         samtools_backend=cfg.samtools_backend,
     )
 
+    logger.info("Adding BAM tags and BAM flags to adata.obs")
+    add_read_tag_annotations(
+        raw_adata,
+        se_bam_files,
+        tag_names=getattr(cfg, "bam_tag_names", ["NM", "MD", "MM", "ML"]),
+        include_flags=True,
+        include_cigar=True,
+        extract_read_tags_from_bam_callable=extract_read_tags_from_bam,
+        samtools_backend=cfg.samtools_backend,
+    )
+
+    if getattr(cfg, "annotate_secondary_supplementary", False):
+        logger.info("Annotating secondary/supplementary alignments from aligned BAM")
+        add_secondary_supplementary_alignment_flags(
+            raw_adata,
+            aligned_sorted_output,
+            samtools_backend=cfg.samtools_backend,
+        )
+
     raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
     ########################################################################################################################
 
@@ -740,7 +744,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
         raw_adata,
         cfg.input_data_path,
         n_jobs=cfg.threads,
-        csv_path=
+        csv_path=load_directory / "read_to_pod5_origin_mapping.csv",
     )
     ########################################################################################################################
 
@@ -759,12 +763,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
     ############################################### MultiQC HTML Report ###############################################
 
     # multiqc ###
-    mqc_dir =
+    mqc_dir = load_directory / "multiqc"
     if mqc_dir.is_dir():
         logger.info(f"{mqc_dir} already exists, skipping multiqc")
     else:
         logger.info("Running multiqc")
-        run_multiqc(
+        run_multiqc(bam_qc_dir, mqc_dir)
     ########################################################################################################################
 
     ############################################### delete intermediate BAM files ###############################################