smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
1
4
|
from pathlib import Path
|
|
2
5
|
from typing import Optional, Tuple
|
|
3
6
|
|
|
4
7
|
import anndata as ad
|
|
5
8
|
|
|
6
|
-
from smftools.
|
|
9
|
+
from smftools.constants import LOGGING_DIR, SEQUENCE_INTEGER_ENCODING, SPATIAL_DIR
|
|
10
|
+
from smftools.logging_utils import get_logger, setup_logging
|
|
11
|
+
from smftools.optional_imports import require
|
|
7
12
|
|
|
8
13
|
logger = get_logger(__name__)
|
|
9
14
|
|
|
@@ -32,15 +37,13 @@ def spatial_adata(
|
|
|
32
37
|
Path to the “current” spatial AnnData (or hmm AnnData if we skip to that).
|
|
33
38
|
"""
|
|
34
39
|
from ..readwrite import add_or_update_column_in_csv, safe_read_h5ad
|
|
35
|
-
from .helpers import get_adata_paths
|
|
36
|
-
from .load_adata import load_adata
|
|
37
|
-
from .preprocess_adata import preprocess_adata
|
|
40
|
+
from .helpers import get_adata_paths, load_experiment_config
|
|
38
41
|
|
|
39
42
|
# 1) Ensure config + basic paths via load_adata
|
|
40
|
-
|
|
43
|
+
cfg = load_experiment_config(config_path)
|
|
44
|
+
|
|
41
45
|
paths = get_adata_paths(cfg)
|
|
42
46
|
|
|
43
|
-
raw_path = paths.raw
|
|
44
47
|
pp_path = paths.pp
|
|
45
48
|
pp_dedup_path = paths.pp_dedup
|
|
46
49
|
spatial_path = paths.spatial
|
|
@@ -48,47 +51,34 @@ def spatial_adata(
|
|
|
48
51
|
|
|
49
52
|
# Stage-skipping logic for spatial
|
|
50
53
|
if not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
51
|
-
# If HMM exists, it's the most processed stage — reuse it.
|
|
52
|
-
if hmm_path.exists():
|
|
53
|
-
logger.info(f"HMM AnnData found: {hmm_path}\nSkipping smftools spatial")
|
|
54
|
-
return None, hmm_path
|
|
55
|
-
|
|
56
54
|
# If spatial exists, we consider spatial analyses already done.
|
|
57
55
|
if spatial_path.exists():
|
|
58
56
|
logger.info(f"Spatial AnnData found: {spatial_path}\nSkipping smftools spatial")
|
|
59
57
|
return None, spatial_path
|
|
60
58
|
|
|
61
|
-
# 2) Ensure preprocessing has been run
|
|
62
|
-
# This will create pp/pp_dedup as needed or return them if they already exist.
|
|
63
|
-
pp_adata, pp_adata_path_ret, pp_dedup_adata, pp_dedup_adata_path_ret = preprocess_adata(
|
|
64
|
-
config_path
|
|
65
|
-
)
|
|
66
|
-
|
|
67
59
|
# Helper to load from disk, reusing loaded_adata if it matches
|
|
68
60
|
def _load(path: Path):
|
|
69
|
-
if loaded_adata is not None and loaded_path == path:
|
|
70
|
-
return loaded_adata
|
|
71
61
|
adata, _ = safe_read_h5ad(path)
|
|
72
62
|
return adata
|
|
73
63
|
|
|
74
64
|
# 3) Decide which AnnData to use as the *starting point* for spatial analyses
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
65
|
+
if hmm_path.exists():
|
|
66
|
+
start_adata = _load(hmm_path)
|
|
67
|
+
source_path = hmm_path
|
|
68
|
+
elif spatial_path.exists():
|
|
69
|
+
start_adata = _load(spatial_path)
|
|
70
|
+
source_path = spatial_path
|
|
71
|
+
elif pp_dedup_path.exists():
|
|
72
|
+
start_adata = _load(pp_dedup_path)
|
|
73
|
+
source_path = pp_dedup_path
|
|
74
|
+
elif pp_path.exists():
|
|
75
|
+
start_adata = _load(pp_path)
|
|
76
|
+
source_path = pp_path
|
|
79
77
|
else:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
start_adata = _load(pp_path)
|
|
85
|
-
source_path = pp_path
|
|
86
|
-
elif raw_path.exists():
|
|
87
|
-
start_adata = _load(raw_path)
|
|
88
|
-
source_path = raw_path
|
|
89
|
-
else:
|
|
90
|
-
logger.warning("No suitable AnnData found for spatial analyses (need at least raw).")
|
|
91
|
-
return None, None
|
|
78
|
+
logger.warning(
|
|
79
|
+
"No suitable AnnData found for spatial analyses (need at least preprocessed)."
|
|
80
|
+
)
|
|
81
|
+
return None, None
|
|
92
82
|
|
|
93
83
|
# 4) Run the spatial core
|
|
94
84
|
adata_spatial, spatial_path = spatial_adata_core(
|
|
@@ -96,15 +86,10 @@ def spatial_adata(
|
|
|
96
86
|
cfg=cfg,
|
|
97
87
|
spatial_adata_path=spatial_path,
|
|
98
88
|
pp_adata_path=pp_path,
|
|
99
|
-
pp_dup_rem_adata_path=pp_dedup_path,
|
|
100
|
-
pp_adata_in_memory=pp_adata,
|
|
101
89
|
source_adata_path=source_path,
|
|
102
90
|
config_path=config_path,
|
|
103
91
|
)
|
|
104
92
|
|
|
105
|
-
# 5) Register spatial path in summary CSV
|
|
106
|
-
add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_path)
|
|
107
|
-
|
|
108
93
|
return adata_spatial, spatial_path
|
|
109
94
|
|
|
110
95
|
|
|
@@ -113,8 +98,6 @@ def spatial_adata_core(
|
|
|
113
98
|
cfg,
|
|
114
99
|
spatial_adata_path: Path,
|
|
115
100
|
pp_adata_path: Path,
|
|
116
|
-
pp_dup_rem_adata_path: Path,
|
|
117
|
-
pp_adata_in_memory: Optional[ad.AnnData] = None,
|
|
118
101
|
source_adata_path: Optional[Path] = None,
|
|
119
102
|
config_path: Optional[str] = None,
|
|
120
103
|
) -> Tuple[ad.AnnData, Path]:
|
|
@@ -126,8 +109,6 @@ def spatial_adata_core(
|
|
|
126
109
|
- `cfg` is the ExperimentConfig.
|
|
127
110
|
- `spatial_adata_path`, `pp_adata_path`, `pp_dup_rem_adata_path` are canonical paths
|
|
128
111
|
from `get_adata_paths`.
|
|
129
|
-
- `pp_adata_in_memory` optionally holds the preprocessed (non-dedup) AnnData from
|
|
130
|
-
the same run of `preprocess_adata`, to avoid re-reading from disk.
|
|
131
112
|
|
|
132
113
|
Does:
|
|
133
114
|
- Optional sample sheet load.
|
|
@@ -149,16 +130,17 @@ def spatial_adata_core(
|
|
|
149
130
|
"""
|
|
150
131
|
import os
|
|
151
132
|
import warnings
|
|
133
|
+
from datetime import datetime
|
|
152
134
|
from pathlib import Path
|
|
153
135
|
|
|
154
136
|
import numpy as np
|
|
155
137
|
import pandas as pd
|
|
156
|
-
import scanpy as sc
|
|
157
138
|
|
|
158
139
|
from ..metadata import record_smftools_metadata
|
|
159
140
|
from ..plotting import (
|
|
160
141
|
combined_raw_clustermap,
|
|
161
142
|
plot_rolling_grid,
|
|
143
|
+
plot_rolling_nn_and_layer,
|
|
162
144
|
plot_spatial_autocorr_grid,
|
|
163
145
|
)
|
|
164
146
|
from ..preprocessing import (
|
|
@@ -167,11 +149,12 @@ def spatial_adata_core(
|
|
|
167
149
|
reindex_references_adata,
|
|
168
150
|
)
|
|
169
151
|
from ..readwrite import make_dirs, safe_read_h5ad
|
|
170
|
-
from ..tools import
|
|
152
|
+
from ..tools import rolling_window_nn_distance
|
|
171
153
|
from ..tools.position_stats import (
|
|
172
154
|
compute_positionwise_statistics,
|
|
173
155
|
plot_positionwise_matrices,
|
|
174
156
|
)
|
|
157
|
+
from ..tools.rolling_nn_distance import assign_rolling_nn_results
|
|
175
158
|
from ..tools.spatial_autocorrelation import (
|
|
176
159
|
analyze_autocorr_matrix,
|
|
177
160
|
binary_autocorrelation_with_spacing,
|
|
@@ -183,8 +166,24 @@ def spatial_adata_core(
|
|
|
183
166
|
# -----------------------------
|
|
184
167
|
# General setup
|
|
185
168
|
# -----------------------------
|
|
169
|
+
date_str = datetime.today().strftime("%y%m%d")
|
|
170
|
+
now = datetime.now()
|
|
171
|
+
time_str = now.strftime("%H%M%S")
|
|
172
|
+
log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
|
|
173
|
+
|
|
186
174
|
output_directory = Path(cfg.output_directory)
|
|
187
|
-
|
|
175
|
+
spatial_directory = output_directory / SPATIAL_DIR
|
|
176
|
+
logging_directory = spatial_directory / LOGGING_DIR
|
|
177
|
+
|
|
178
|
+
make_dirs([output_directory, spatial_directory])
|
|
179
|
+
|
|
180
|
+
if cfg.emit_log_file:
|
|
181
|
+
log_file = logging_directory / f"{date_str}_{time_str}_log.log"
|
|
182
|
+
make_dirs([logging_directory])
|
|
183
|
+
else:
|
|
184
|
+
log_file = None
|
|
185
|
+
|
|
186
|
+
setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
|
|
188
187
|
|
|
189
188
|
smf_modality = cfg.smf_modality
|
|
190
189
|
if smf_modality == "conversion":
|
|
@@ -192,8 +191,6 @@ def spatial_adata_core(
|
|
|
192
191
|
else:
|
|
193
192
|
deaminase = True
|
|
194
193
|
|
|
195
|
-
first_pp_run = pp_adata_in_memory is not None and pp_dup_rem_adata_path.exists()
|
|
196
|
-
|
|
197
194
|
# -----------------------------
|
|
198
195
|
# Optional sample sheet metadata
|
|
199
196
|
# -----------------------------
|
|
@@ -227,7 +224,6 @@ def spatial_adata_core(
|
|
|
227
224
|
else:
|
|
228
225
|
reindex_suffix = None
|
|
229
226
|
|
|
230
|
-
pp_dir = output_directory / "preprocessed"
|
|
231
227
|
references = adata.obs[cfg.reference_column].cat.categories
|
|
232
228
|
|
|
233
229
|
# ============================================================
|
|
@@ -237,7 +233,7 @@ def spatial_adata_core(
|
|
|
237
233
|
preprocessed_version_available = pp_adata_path.exists()
|
|
238
234
|
|
|
239
235
|
if preprocessed_version_available:
|
|
240
|
-
pp_clustermap_dir =
|
|
236
|
+
pp_clustermap_dir = spatial_directory / "06_clustermaps"
|
|
241
237
|
|
|
242
238
|
if pp_clustermap_dir.is_dir() and not getattr(
|
|
243
239
|
cfg, "force_redo_spatial_analyses", False
|
|
@@ -246,12 +242,9 @@ def spatial_adata_core(
|
|
|
246
242
|
f"{pp_clustermap_dir} already exists. Skipping clustermap plotting for preprocessed AnnData."
|
|
247
243
|
)
|
|
248
244
|
else:
|
|
249
|
-
make_dirs([
|
|
245
|
+
make_dirs([spatial_directory, pp_clustermap_dir])
|
|
250
246
|
|
|
251
|
-
|
|
252
|
-
pp_adata = pp_adata_in_memory
|
|
253
|
-
else:
|
|
254
|
-
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
247
|
+
pp_adata, _ = safe_read_h5ad(pp_adata_path)
|
|
255
248
|
|
|
256
249
|
# -----------------------------
|
|
257
250
|
# Optional sample sheet metadata
|
|
@@ -300,7 +293,7 @@ def spatial_adata_core(
|
|
|
300
293
|
0
|
|
301
294
|
],
|
|
302
295
|
min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
|
|
303
|
-
demux_types=
|
|
296
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
304
297
|
bins=None,
|
|
305
298
|
sample_mapping=None,
|
|
306
299
|
save_path=pp_clustermap_dir,
|
|
@@ -310,19 +303,18 @@ def spatial_adata_core(
|
|
|
310
303
|
)
|
|
311
304
|
|
|
312
305
|
# ============================================================
|
|
313
|
-
# 2) Clustermaps
|
|
306
|
+
# 2) Clustermaps on *deduplicated* preprocessed AnnData
|
|
314
307
|
# ============================================================
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
pp_umap_dir = pp_dir_dedup / "07_umaps"
|
|
308
|
+
spatial_dir_dedup = spatial_directory / "deduplicated"
|
|
309
|
+
clustermap_dir_dedup = spatial_dir_dedup / "06_clustermaps"
|
|
318
310
|
|
|
319
311
|
# Clustermaps on deduplicated adata
|
|
320
|
-
if
|
|
312
|
+
if clustermap_dir_dedup.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
321
313
|
logger.debug(
|
|
322
|
-
f"{
|
|
314
|
+
f"{clustermap_dir_dedup} already exists. Skipping clustermap plotting for deduplicated AnnData."
|
|
323
315
|
)
|
|
324
316
|
else:
|
|
325
|
-
make_dirs([
|
|
317
|
+
make_dirs([spatial_dir_dedup, clustermap_dir_dedup])
|
|
326
318
|
combined_raw_clustermap(
|
|
327
319
|
adata,
|
|
328
320
|
sample_col=cfg.sample_name_col_for_plotting,
|
|
@@ -342,53 +334,113 @@ def spatial_adata_core(
|
|
|
342
334
|
0
|
|
343
335
|
],
|
|
344
336
|
min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
|
|
345
|
-
demux_types=
|
|
337
|
+
demux_types=cfg.clustermap_demux_types_to_plot,
|
|
346
338
|
bins=None,
|
|
347
339
|
sample_mapping=None,
|
|
348
|
-
save_path=
|
|
340
|
+
save_path=clustermap_dir_dedup,
|
|
349
341
|
sort_by=cfg.spatial_clustermap_sortby,
|
|
350
342
|
deaminase=deaminase,
|
|
351
343
|
index_col_suffix=reindex_suffix,
|
|
352
344
|
)
|
|
353
345
|
|
|
354
|
-
#
|
|
355
|
-
|
|
356
|
-
|
|
346
|
+
# ============================================================
|
|
347
|
+
# 2b) Rolling NN distances + layer clustermaps
|
|
348
|
+
# ============================================================
|
|
349
|
+
pp_rolling_nn_dir = spatial_dir_dedup / "06b_rolling_nn_clustermaps"
|
|
350
|
+
|
|
351
|
+
if pp_rolling_nn_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
352
|
+
logger.debug(f"{pp_rolling_nn_dir} already exists. Skipping rolling NN distance plots.")
|
|
357
353
|
else:
|
|
358
|
-
make_dirs([
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
if smf_modality == "direct":
|
|
362
|
-
for ref in references:
|
|
363
|
-
for base in cfg.mod_target_bases:
|
|
364
|
-
var_filters.append(f"{ref}_{base}_site")
|
|
365
|
-
elif deaminase:
|
|
366
|
-
for ref in references:
|
|
367
|
-
var_filters.append(f"{ref}_C_site")
|
|
368
|
-
else:
|
|
369
|
-
for ref in references:
|
|
370
|
-
for base in cfg.mod_target_bases:
|
|
371
|
-
var_filters.append(f"{ref}_{base}_site")
|
|
372
|
-
|
|
373
|
-
adata = calculate_umap(
|
|
374
|
-
adata,
|
|
375
|
-
layer=cfg.layer_for_umap_plotting,
|
|
376
|
-
var_filters=var_filters,
|
|
377
|
-
n_pcs=10,
|
|
378
|
-
knn_neighbors=15,
|
|
354
|
+
make_dirs([pp_rolling_nn_dir])
|
|
355
|
+
samples = (
|
|
356
|
+
adata.obs[cfg.sample_name_col_for_plotting].astype("category").cat.categories.tolist()
|
|
379
357
|
)
|
|
358
|
+
references = adata.obs[cfg.reference_column].astype("category").cat.categories.tolist()
|
|
380
359
|
|
|
381
|
-
|
|
360
|
+
for reference in references:
|
|
361
|
+
for sample in samples:
|
|
362
|
+
mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (
|
|
363
|
+
adata.obs[cfg.reference_column] == reference
|
|
364
|
+
)
|
|
365
|
+
if not mask.any():
|
|
366
|
+
continue
|
|
367
|
+
|
|
368
|
+
subset = adata[mask]
|
|
369
|
+
site_mask = (
|
|
370
|
+
adata.var[[f"{reference}_{st}_site" for st in cfg.rolling_nn_site_types]]
|
|
371
|
+
.fillna(False)
|
|
372
|
+
.any(axis=1)
|
|
373
|
+
)
|
|
374
|
+
subset = subset[:, site_mask].copy()
|
|
375
|
+
try:
|
|
376
|
+
rolling_values, rolling_starts = rolling_window_nn_distance(
|
|
377
|
+
subset,
|
|
378
|
+
layer=cfg.rolling_nn_layer,
|
|
379
|
+
window=cfg.rolling_nn_window,
|
|
380
|
+
step=cfg.rolling_nn_step,
|
|
381
|
+
min_overlap=cfg.rolling_nn_min_overlap,
|
|
382
|
+
return_fraction=cfg.rolling_nn_return_fraction,
|
|
383
|
+
store_obsm=cfg.rolling_nn_obsm_key,
|
|
384
|
+
)
|
|
385
|
+
except Exception as exc:
|
|
386
|
+
logger.warning(
|
|
387
|
+
"Rolling NN distance computation failed for sample=%s ref=%s: %s",
|
|
388
|
+
sample,
|
|
389
|
+
reference,
|
|
390
|
+
exc,
|
|
391
|
+
)
|
|
392
|
+
continue
|
|
382
393
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
394
|
+
safe_sample = str(sample).replace(os.sep, "_")
|
|
395
|
+
safe_ref = str(reference).replace(os.sep, "_")
|
|
396
|
+
parent_obsm_key = f"{cfg.rolling_nn_obsm_key}__{safe_ref}"
|
|
397
|
+
try:
|
|
398
|
+
assign_rolling_nn_results(
|
|
399
|
+
adata,
|
|
400
|
+
subset,
|
|
401
|
+
rolling_values,
|
|
402
|
+
rolling_starts,
|
|
403
|
+
obsm_key=parent_obsm_key,
|
|
404
|
+
window=cfg.rolling_nn_window,
|
|
405
|
+
step=cfg.rolling_nn_step,
|
|
406
|
+
min_overlap=cfg.rolling_nn_min_overlap,
|
|
407
|
+
return_fraction=cfg.rolling_nn_return_fraction,
|
|
408
|
+
layer=cfg.rolling_nn_layer,
|
|
409
|
+
)
|
|
410
|
+
except Exception as exc:
|
|
411
|
+
logger.warning(
|
|
412
|
+
"Failed to merge rolling NN results for sample=%s ref=%s: %s",
|
|
413
|
+
sample,
|
|
414
|
+
reference,
|
|
415
|
+
exc,
|
|
416
|
+
)
|
|
417
|
+
adata.uns.setdefault(f"{cfg.rolling_nn_obsm_key}_reference_map", {})[reference] = (
|
|
418
|
+
parent_obsm_key
|
|
419
|
+
)
|
|
420
|
+
out_png = pp_rolling_nn_dir / f"{safe_sample}__{safe_ref}.png"
|
|
421
|
+
title = f"{sample} {reference}"
|
|
422
|
+
try:
|
|
423
|
+
plot_rolling_nn_and_layer(
|
|
424
|
+
subset,
|
|
425
|
+
obsm_key=cfg.rolling_nn_obsm_key,
|
|
426
|
+
layer_key=cfg.rolling_nn_plot_layer,
|
|
427
|
+
max_nan_fraction=cfg.position_max_nan_threshold,
|
|
428
|
+
var_valid_fraction_col=f"{reference}_valid_fraction",
|
|
429
|
+
title=title,
|
|
430
|
+
save_name=out_png,
|
|
431
|
+
)
|
|
432
|
+
except Exception as exc:
|
|
433
|
+
logger.warning(
|
|
434
|
+
"Failed rolling NN plot for sample=%s ref=%s: %s",
|
|
435
|
+
sample,
|
|
436
|
+
reference,
|
|
437
|
+
exc,
|
|
438
|
+
)
|
|
387
439
|
|
|
388
440
|
# ============================================================
|
|
389
441
|
# 3) Spatial autocorrelation + rolling metrics
|
|
390
442
|
# ============================================================
|
|
391
|
-
pp_autocorr_dir =
|
|
443
|
+
pp_autocorr_dir = spatial_dir_dedup / "08_autocorrelations"
|
|
392
444
|
|
|
393
445
|
if pp_autocorr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
394
446
|
logger.debug(f"{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.")
|
|
@@ -731,10 +783,10 @@ def spatial_adata_core(
|
|
|
731
783
|
# ============================================================
|
|
732
784
|
# 4) Pearson / correlation matrices
|
|
733
785
|
# ============================================================
|
|
734
|
-
|
|
786
|
+
corr_dir = spatial_dir_dedup / "09_correlation_matrices"
|
|
735
787
|
|
|
736
|
-
if
|
|
737
|
-
logger.debug(f"{
|
|
788
|
+
if corr_dir.is_dir() and not getattr(cfg, "force_redo_spatial_analyses", False):
|
|
789
|
+
logger.debug(f"{corr_dir} already exists. Skipping correlation matrix plotting.")
|
|
738
790
|
else:
|
|
739
791
|
compute_positionwise_statistics(
|
|
740
792
|
adata,
|
|
@@ -759,7 +811,7 @@ def spatial_adata_core(
|
|
|
759
811
|
cmaps=cfg.correlation_matrix_cmaps,
|
|
760
812
|
vmin=None,
|
|
761
813
|
vmax=None,
|
|
762
|
-
output_dir=
|
|
814
|
+
output_dir=corr_dir,
|
|
763
815
|
output_key="positionwise_result",
|
|
764
816
|
)
|
|
765
817
|
|
smftools/cli_entry.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from pathlib import Path
|
|
3
5
|
from typing import Sequence
|
|
@@ -6,14 +8,37 @@ import click
|
|
|
6
8
|
import pandas as pd
|
|
7
9
|
|
|
8
10
|
from .cli.hmm_adata import hmm_adata
|
|
11
|
+
from .cli.latent_adata import latent_adata
|
|
9
12
|
from .cli.load_adata import load_adata
|
|
10
13
|
from .cli.preprocess_adata import preprocess_adata
|
|
11
14
|
from .cli.spatial_adata import spatial_adata
|
|
12
15
|
from .informatics.pod5_functions import subsample_pod5
|
|
13
|
-
from .logging_utils import setup_logging
|
|
16
|
+
from .logging_utils import get_logger, setup_logging
|
|
14
17
|
from .readwrite import concatenate_h5ads
|
|
15
18
|
|
|
16
19
|
|
|
20
|
+
def _configure_multiprocessing() -> None:
|
|
21
|
+
import multiprocessing as mp
|
|
22
|
+
import sys
|
|
23
|
+
|
|
24
|
+
logger = get_logger(__name__)
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
if sys.platform == "win32":
|
|
28
|
+
mp.set_start_method("spawn")
|
|
29
|
+
logger.debug("Setting multiprocessing start method to spawn")
|
|
30
|
+
else:
|
|
31
|
+
# try forkserver first, fallback to spawn
|
|
32
|
+
try:
|
|
33
|
+
mp.set_start_method("forkserver")
|
|
34
|
+
logger.debug("Setting multiprocessing start method to forkserver")
|
|
35
|
+
except ValueError:
|
|
36
|
+
mp.set_start_method("spawn")
|
|
37
|
+
logger.debug("Setting multiprocessing start method to spawn")
|
|
38
|
+
except RuntimeError:
|
|
39
|
+
logger.warning("Could not set multiprocessing start method")
|
|
40
|
+
|
|
41
|
+
|
|
17
42
|
@click.group()
|
|
18
43
|
@click.option(
|
|
19
44
|
"--log-file",
|
|
@@ -32,6 +57,7 @@ def cli(log_file: Path | None, log_level: str):
|
|
|
32
57
|
"""Command-line interface for smftools."""
|
|
33
58
|
level = getattr(logging, log_level.upper(), logging.INFO)
|
|
34
59
|
setup_logging(level=level, log_file=log_file)
|
|
60
|
+
_configure_multiprocessing()
|
|
35
61
|
|
|
36
62
|
|
|
37
63
|
####### Load anndata from raw data ###########
|
|
@@ -78,6 +104,17 @@ def hmm(config_path):
|
|
|
78
104
|
##########################################
|
|
79
105
|
|
|
80
106
|
|
|
107
|
+
####### Latent ###########
|
|
108
|
+
@cli.command()
|
|
109
|
+
@click.argument("config_path", type=click.Path(exists=True))
|
|
110
|
+
def latent(config_path):
|
|
111
|
+
"""Process data from CONFIG_PATH."""
|
|
112
|
+
latent_adata(config_path)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
##########################################
|
|
116
|
+
|
|
117
|
+
|
|
81
118
|
####### batch command ###########
|
|
82
119
|
@cli.command()
|
|
83
120
|
@click.argument(
|
smftools/config/__init__.py
CHANGED
smftools/config/conversion.yaml
CHANGED
|
@@ -15,6 +15,16 @@ autocorr_site_types:
|
|
|
15
15
|
|
|
16
16
|
# Spatial Analysis - Clustermap params
|
|
17
17
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
18
|
+
rolling_nn_layer: "nan0_0minus1"
|
|
19
|
+
rolling_nn_plot_layer: "nan0_0minus1"
|
|
20
|
+
rolling_nn_window: 30
|
|
21
|
+
rolling_nn_step: 2
|
|
22
|
+
rolling_nn_min_overlap: 20
|
|
23
|
+
rolling_nn_return_fraction: true
|
|
24
|
+
rolling_nn_obsm_key: "rolling_nn_dist"
|
|
25
|
+
rolling_nn_site_types:
|
|
26
|
+
- "GpC"
|
|
27
|
+
- "CpG"
|
|
18
28
|
clustermap_cmap_c: "coolwarm"
|
|
19
29
|
clustermap_cmap_gpc: "coolwarm"
|
|
20
30
|
clustermap_cmap_cpg: "viridis"
|
|
@@ -46,4 +56,4 @@ hmm_feature_sets:
|
|
|
46
56
|
cpg_patch: [0, inf]
|
|
47
57
|
|
|
48
58
|
hmm_merge_layer_features:
|
|
49
|
-
- ["all_accessible_features", 60]
|
|
59
|
+
- ["all_accessible_features", 60]
|
smftools/config/default.yaml
CHANGED
|
@@ -18,8 +18,9 @@ conversions:
|
|
|
18
18
|
fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
|
|
19
19
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
20
20
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
21
|
+
|
|
21
22
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
22
|
-
delete_intermediate_bams:
|
|
23
|
+
delete_intermediate_bams: False # Whether to delete intermediate BAM files.
|
|
23
24
|
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
24
25
|
|
|
25
26
|
# Sequencing modality and general experiment params
|
|
@@ -77,6 +78,10 @@ aligner_args:
|
|
|
77
78
|
# Sorted BAM and BED specific handling
|
|
78
79
|
make_bigwigs: False # Whether to make coverage bigwigs
|
|
79
80
|
make_beds: False # Whether to make beds from the aligned bams
|
|
81
|
+
annotate_secondary_supplementary: True # Whether to annotate reads with secondary/supplementary alignments from the aligned BAM
|
|
82
|
+
samtools_backend: auto # auto|python|cli for samtools-compatible operations
|
|
83
|
+
bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
|
|
84
|
+
bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
|
|
80
85
|
|
|
81
86
|
# Nanopore specific demultiplexing
|
|
82
87
|
barcode_both_ends: False # dorado demultiplexing
|
|
@@ -87,6 +92,12 @@ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall w
|
|
|
87
92
|
reference_column: 'Reference_strand'
|
|
88
93
|
sample_column: 'Experiment_name_and_barcode'
|
|
89
94
|
|
|
95
|
+
# Plotting params
|
|
96
|
+
clustermap_demux_types_to_plot:
|
|
97
|
+
- "single"
|
|
98
|
+
- "double"
|
|
99
|
+
- "already"
|
|
100
|
+
|
|
90
101
|
######## smftools preprocess params #########
|
|
91
102
|
# Read length, quality, and mapping filtering params
|
|
92
103
|
read_coord_filter:
|
|
@@ -137,6 +148,10 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
|
|
|
137
148
|
- "CpG"
|
|
138
149
|
- "ambiguous_GpC_CpG"
|
|
139
150
|
duplicate_detection_distance_threshold: 0.07 # Hamming-distance-based similarity threshold to use for marking duplicate reads.
|
|
151
|
+
duplicate_detection_demux_types_to_use:
|
|
152
|
+
- "single"
|
|
153
|
+
- "double"
|
|
154
|
+
- "already"
|
|
140
155
|
hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
|
|
141
156
|
- Fraction_C_site_modified
|
|
142
157
|
duplicate_detection_keep_best_metric: "read_quality" # Obs metric used to choose the representative read to keep from each cluster of duplicate reads
|
|
@@ -148,6 +163,11 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
|
|
|
148
163
|
|
|
149
164
|
# Position QC params
|
|
150
165
|
position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
|
|
166
|
+
mismatch_frequency_range:
|
|
167
|
+
- 0.01
|
|
168
|
+
- 0.99
|
|
169
|
+
mismatch_frequency_layer: "mismatch_integer_encoding"
|
|
170
|
+
mismatch_frequency_read_span_layer: "read_span_mask"
|
|
151
171
|
|
|
152
172
|
######## smftools spatial params #########
|
|
153
173
|
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
@@ -166,6 +186,9 @@ clustermap_cmap_gpc: "coolwarm"
|
|
|
166
186
|
clustermap_cmap_cpg: "coolwarm"
|
|
167
187
|
clustermap_cmap_a: "coolwarm"
|
|
168
188
|
spatial_clustermap_sortby: "gpc"
|
|
189
|
+
rolling_nn_site_types:
|
|
190
|
+
- "GpC"
|
|
191
|
+
- "CpG"
|
|
169
192
|
|
|
170
193
|
# Spatial Analysis - UMAP/Leiden params
|
|
171
194
|
layer_for_umap_plotting: 'nan_half'
|
|
@@ -240,6 +263,18 @@ hmm_feature_sets:
|
|
|
240
263
|
mid_accessible_patch: [20, 40]
|
|
241
264
|
large_accessible_patch: [40, 110]
|
|
242
265
|
nucleosome_depleted_region: [110, inf]
|
|
266
|
+
hmm_feature_colormaps:
|
|
267
|
+
small_accessible_patch: "#A5D6A7"
|
|
268
|
+
mid_accessible_patch: "#2E7D32"
|
|
269
|
+
large_accessible_patch: "#006400"
|
|
270
|
+
nucleosome_depleted_region: "#00441B"
|
|
271
|
+
all_accessible_features: "#2E7D32"
|
|
272
|
+
small_bound_stretch: "#1E88E5"
|
|
273
|
+
medium_bound_stretch: "#6A1B9A"
|
|
274
|
+
large_bound_stretch: "#FB8C00"
|
|
275
|
+
putative_nucleosome: "#6D4C41"
|
|
276
|
+
all_footprint_features: "#6A1B9A"
|
|
277
|
+
cpg_patch: "#6D4C41"
|
|
243
278
|
hmm_merge_layer_features:
|
|
244
279
|
- ["all_accessible_features", 60]
|
|
245
280
|
clustermap_cmap_hmm: "coolwarm"
|
|
@@ -256,6 +291,11 @@ hmm_clustermap_feature_layers:
|
|
|
256
291
|
- medium_bound_stretch
|
|
257
292
|
- putative_nucleosome
|
|
258
293
|
- large_bound_stretch
|
|
294
|
+
- all_footprint_features
|
|
295
|
+
hmm_clustermap_length_layers:
|
|
296
|
+
- all_accessible_features
|
|
297
|
+
- all_accessible_features_merged
|
|
298
|
+
- all_footprint_features
|
|
259
299
|
hmm_clustermap_sortby: "hmm"
|
|
260
300
|
hmm_peak_feature_configs:
|
|
261
301
|
all_accessible_features:
|
|
@@ -370,4 +410,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
|
|
|
370
410
|
bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
|
|
371
411
|
force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
|
|
372
412
|
bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
|
|
373
|
-
force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
|
|
413
|
+
force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
|