smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/conversion.yaml +11 -6
- smftools/config/deaminase.yaml +12 -7
- smftools/config/default.yaml +36 -25
- smftools/config/direct.yaml +25 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +109 -12
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1037 -362
- smftools/preprocessing/__init__.py +2 -0
- smftools/preprocessing/append_base_context.py +3 -3
- smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/readwrite.py +266 -140
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/cli/spatial_adata.py (new file)
@@ -0,0 +1,564 @@
+def spatial_adata(config_path):
+    """
+    High-level function to call for spatial analysis of an adata object.
+    The command line accesses this through: smftools spatial <config_path>
+
+    Parameters:
+        config_path (str): File path to the experiment configuration csv file.
+
+    Returns:
+        (pp_dedup_spatial_adata, pp_dedup_spatial_adata_path)
+    """
+    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
+    from .load_adata import load_adata
+    from .preprocess_adata import preprocess_adata
+
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import scanpy as sc
+
+    import os
+    from importlib import resources
+    from pathlib import Path
+
+    from datetime import datetime
+    date_str = datetime.today().strftime("%y%m%d")
+
+    ############################################### smftools load start ###############################################
+    adata, adata_path, cfg = load_adata(config_path)
+    # General config variable init - necessary user-passed inputs
+    smf_modality = cfg.smf_modality  # whether the data is conversion SMF, direct methylation-detection SMF, or deaminase SMF. Necessary.
+    output_directory = Path(cfg.output_directory)  # path of the output directory to make for the analysis. Necessary.
+    # Make the initial output directory
+    make_dirs([output_directory])
+    ############################################### smftools load end ###############################################
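For orientation, the new module is wired to both entry points: the `smftools spatial` subcommand (per the docstring above and the new `entry_points.txt`) and a direct Python call. A minimal usage sketch; the config filename is a placeholder:

```python
# Minimal usage sketch; "experiment_config.csv" is a placeholder path.
# CLI equivalent (per the docstring):
#   smftools spatial experiment_config.csv
from smftools.cli.spatial_adata import spatial_adata

adata, spatial_path = spatial_adata("experiment_config.csv")
print(adata)          # AnnData with the spatial metrics attached
print(spatial_path)   # path of the gzip-compressed .h5ad that was written
```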
+
+    ############################################### smftools preprocess start ###############################################
+    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+    ############################################### smftools preprocess end ###############################################
+
+    ############################################### smftools spatial start ###############################################
+    input_manager_df = pd.read_csv(cfg.summary_file)
+    initial_adata_path = Path(input_manager_df['load_adata'][0])
+    pp_adata_path = Path(input_manager_df['pp_adata'][0])
+    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
+    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
+    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
+
+    if smf_modality == 'conversion':
+        deaminase = False
+    else:
+        deaminase = True
+
+    if pp_adata and pp_dedup_adata:
+        # This happens on the first run of the preprocessing pipeline
+        first_pp_run = True
+        adata = pp_adata
+        adata_unique = pp_dedup_adata
+        # The preprocessed object is in memory; set the flag here as well so the
+        # clustermap block below does not hit a NameError on a first run.
+        preprocessed_version_available = True
+    else:
+        # If an anndata is saved, check which stages of the anndata are available
+        first_pp_run = False
+        initial_version_available = initial_adata_path.exists()
+        preprocessed_version_available = pp_adata_path.exists()
+        preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
+        preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
+        hmm_version_available = hmm_adata_path.exists()
+
+        if cfg.force_redo_basic_analyses:
+            print("Forcing redo of the basic analysis workflow, starting from the preprocessed adata if available; otherwise the raw adata will be used.")
+            if preprocessed_dup_removed_version_available:
+                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+                adata_version = "pp_dedup"
+            elif preprocessed_version_available:
+                adata, load_report = safe_read_h5ad(pp_adata_path)
+                adata_version = "pp"
+            elif initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+                adata_version = "initial"
+            else:
+                print("Cannot redo duplicate detection when no compatible adata is available: either a raw or a preprocessed adata is required.")
+                return
+        elif preprocessed_dedup_spatial_version_available:
+            print(f"Preprocessed deduplicated spatial anndata found: {spatial_adata_path}")
+            return None, spatial_adata_path
+        elif preprocessed_dup_removed_version_available:
+            adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+            adata_version = "pp_dedup"
+        elif preprocessed_version_available:
+            adata, load_report = safe_read_h5ad(pp_adata_path)
+            adata_version = "pp"
+        elif initial_version_available:
+            adata, load_report = safe_read_h5ad(initial_adata_path)
+            adata_version = "initial"
+        else:
+            print("No adata available.")
+            return
+
+    pp_dir = output_directory / "preprocessed"
+    references = adata.obs[cfg.reference_column].cat.categories
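The block above resumes from whichever pipeline stage is already on disk, keyed off a one-row summary CSV whose columns (`load_adata`, `pp_adata`, `pp_dedup_adata`, `spatial_adata`, `hmm_adata`) record per-stage h5ad paths. A self-contained sketch of that bookkeeping pattern, with `add_or_update_column_in_csv` re-implemented as a hypothetical stand-in rather than the smftools helper:

```python
# Sketch of the one-row "summary file" bookkeeping used for stage resume.
# The column names mirror the ones read above; the helper is a stand-in,
# not the smftools implementation.
from pathlib import Path
import pandas as pd

def add_or_update_column_in_csv(csv_path, column, value):
    """Upsert one column of a single-row CSV mapping stage name -> artifact path."""
    p = Path(csv_path)
    df = pd.read_csv(p) if p.exists() else pd.DataFrame(index=[0])
    df[column] = [str(value)]
    df.to_csv(p, index=False)

add_or_update_column_in_csv("summary.csv", "load_adata", "out/initial.h5ad.gz")
add_or_update_column_in_csv("summary.csv", "pp_adata", "out/preprocessed.h5ad.gz")

summary = pd.read_csv("summary.csv")
# A stage is resumable when its recorded file actually exists on disk.
resumable = {c: Path(summary[c][0]).exists() for c in summary.columns}
print(resumable)
```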
+
+    if smf_modality != 'direct':
+        ######### Clustermaps #########
+        if preprocessed_version_available:
+            pp_clustermap_dir = pp_dir / "06_clustermaps"
+
+            if pp_clustermap_dir.is_dir():
+                print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
+            else:
+                from ..plotting import combined_raw_clustermap
+                make_dirs([pp_dir, pp_clustermap_dir])
+
+                if not first_pp_run:
+                    pp_adata, load_report = safe_read_h5ad(pp_adata_path)
+                else:
+                    pp_adata = adata
+
+                clustermap_results = combined_raw_clustermap(pp_adata,
+                                                             sample_col=cfg.sample_name_col_for_plotting,
+                                                             reference_col=cfg.reference_column,
+                                                             mod_target_bases=cfg.mod_target_bases,
+                                                             layer_any_c=cfg.layer_for_clustermap_plotting,
+                                                             layer_gpc=cfg.layer_for_clustermap_plotting,
+                                                             layer_cpg=cfg.layer_for_clustermap_plotting,
+                                                             layer_a=cfg.layer_for_clustermap_plotting,
+                                                             cmap_any_c="coolwarm",
+                                                             cmap_gpc="coolwarm",
+                                                             cmap_cpg="viridis",
+                                                             cmap_a="coolwarm",
+                                                             min_quality=cfg.read_quality_filter_thresholds[0],
+                                                             min_length=cfg.read_len_filter_thresholds[0],
+                                                             min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+                                                             min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
+                                                             bins=None,
+                                                             sample_mapping=None,
+                                                             save_path=pp_clustermap_dir,
+                                                             sort_by='gpc',
+                                                             deaminase=deaminase)
+                if first_pp_run:
+                    adata = adata_unique
+                else:
+                    pass
+
+        else:
+            pass
+
+    #### Proceed with the deduplicated preprocessed anndata ###
+    pp_dir = pp_dir / "deduplicated"
+    pp_clustermap_dir = pp_dir / "06_clustermaps"
+    pp_umap_dir = pp_dir / "07_umaps"
+
+    if pp_clustermap_dir.is_dir():
+        print(f'{pp_clustermap_dir} already exists. Skipping clustermap plotting.')
+    else:
+        from ..plotting import combined_raw_clustermap
+        make_dirs([pp_dir, pp_clustermap_dir])
+        if smf_modality != 'direct':
+            sort_by = 'gpc'
+        else:
+            sort_by = 'any_a'
+        clustermap_results = combined_raw_clustermap(adata,
+                                                     sample_col=cfg.sample_name_col_for_plotting,
+                                                     reference_col=cfg.reference_column,
+                                                     mod_target_bases=cfg.mod_target_bases,
+                                                     layer_any_c=cfg.layer_for_clustermap_plotting,
+                                                     layer_gpc=cfg.layer_for_clustermap_plotting,
+                                                     layer_cpg=cfg.layer_for_clustermap_plotting,
+                                                     layer_a=cfg.layer_for_clustermap_plotting,
+                                                     cmap_any_c="coolwarm",
+                                                     cmap_gpc="coolwarm",
+                                                     cmap_cpg="viridis",
+                                                     cmap_a="coolwarm",
+                                                     min_quality=cfg.read_quality_filter_thresholds[0],
+                                                     min_length=cfg.read_len_filter_thresholds[0],
+                                                     min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+                                                     min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
+                                                     bins=None,
+                                                     sample_mapping=None,
+                                                     save_path=pp_clustermap_dir,
+                                                     sort_by=sort_by,
+                                                     deaminase=deaminase)
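`combined_raw_clustermap` is an smftools plotting helper whose body is elsewhere in this release. As a generic illustration of the figure type it produces (reads by positions, rows clustered, genomic column order preserved), a seaborn sketch on synthetic data; nothing here is the smftools implementation:

```python
# Generic illustration only: cluster a binary reads-by-positions matrix.
import numpy as np
import seaborn as sns

rng = np.random.default_rng(0)
reads = rng.integers(0, 2, size=(200, 120)).astype(float)   # 200 reads x 120 sites
reads[rng.random(reads.shape) < 0.1] = np.nan               # missing calls
g = sns.clustermap(np.nan_to_num(reads, nan=0.5),           # impute NaN to 0.5 for clustering
                   row_cluster=True, col_cluster=False,     # keep genomic position order
                   cmap="coolwarm", figsize=(8, 6))
g.savefig("clustermap_sketch.png", dpi=160)
```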
+
+    ######### PCA/UMAP/Leiden #########
+    if pp_umap_dir.is_dir():
+        print(f'{pp_umap_dir} already exists. Skipping UMAP plotting.')
+    else:
+        from ..tools import calculate_umap
+        make_dirs([pp_umap_dir])
+
+        var_filters = []
+        if smf_modality == 'direct':
+            for ref in references:
+                for base in cfg.mod_target_bases:
+                    var_filters += [f'{ref}_{base}_site']
+        elif deaminase:
+            for ref in references:
+                var_filters += [f'{ref}_any_C_site']
+        else:
+            for ref in references:
+                for base in cfg.mod_target_bases:
+                    var_filters += [f'{ref}_{base}_site']
+
+        adata = calculate_umap(adata,
+                               layer=cfg.layer_for_umap_plotting,
+                               var_filters=var_filters,
+                               n_pcs=10,
+                               knn_neighbors=15)
+
+        ## Clustering
+        sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
+
+        # Plotting UMAP
+        sc.settings.figdir = pp_umap_dir
+        umap_layers = ['leiden', cfg.sample_name_col_for_plotting, 'Reference_strand']
+        umap_layers += cfg.umap_layers_to_plot
+        sc.pl.umap(adata, color=umap_layers, show=False, save=True)
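`calculate_umap` is likewise not shown in this diff; judging from its arguments it subsets variables by the `*_site` filters and runs the standard scanpy embedding stack on the chosen layer. The conventional scanpy equivalent, as a rough sketch (the site/layer selection is omitted):

```python
# Rough scanpy equivalent of the embedding step (not the calculate_umap source).
import numpy as np
import anndata as ad
import scanpy as sc

rng = np.random.default_rng(0)
adata = ad.AnnData(rng.random((300, 50)).astype(np.float32))

sc.pp.pca(adata, n_comps=10)      # PCA (the helper's n_pcs=10)
sc.pp.neighbors(adata, n_neighbors=15)  # kNN graph (knn_neighbors=15)
sc.tl.umap(adata)                 # 2-D embedding stored in adata.obsm["X_umap"]
sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
print(adata.obs["leiden"].value_counts())
```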
+
+    ########## Spatial autocorrelation analyses ###########
+    from ..tools.spatial_autocorrelation import binary_autocorrelation_with_spacing, analyze_autocorr_matrix, bootstrap_periodicity, rolling_autocorr_metrics
+    from ..plotting import plot_rolling_grid
+    import warnings
+
+    pp_autocorr_dir = pp_dir / "08_autocorrelations"
+
+    if pp_autocorr_dir.is_dir():
+        print(f'{pp_autocorr_dir} already exists. Skipping autocorrelation plotting.')
+    else:
+        positions = adata.var_names.astype(int).values
+        lags = np.arange(cfg.autocorr_max_lag + 1)
+
+        # optional: try to parallelize the per-read autocorrelation with joblib
+        try:
+            from joblib import Parallel, delayed
+            _have_joblib = True
+        except Exception:
+            _have_joblib = False
+
+        for site_type in cfg.autocorr_site_types:
+            layer_key = f"{site_type}_site_binary"
+            if layer_key not in adata.layers:
+                print(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
+                continue
+
+            X = adata.layers[layer_key]
+            if getattr(X, "shape", (0,))[0] == 0:
+                print(f"Layer {layer_key} empty — skipping {site_type}.")
+                continue
+
+            # compute per-molecule autocorrelations (and pair counts)
+            rows = []
+            counts = []
+            if _have_joblib:
+                # parallel map
+                def _worker(row):
+                    try:
+                        ac, cnts = binary_autocorrelation_with_spacing(
+                            row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
+                        )
+                    except Exception:
+                        # on error return NaN arrays
+                        ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
+                        cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
+                    return ac, cnts
+
+                res = Parallel(n_jobs=cfg.n_jobs if hasattr(cfg, "n_jobs") else -1)(
+                    delayed(_worker)(X[i]) for i in range(X.shape[0])
+                )
+                for ac, cnts in res:
+                    rows.append(ac)
+                    counts.append(cnts)
+            else:
+                # sequential fallback
+                for i in range(X.shape[0]):
+                    ac, cnts = binary_autocorrelation_with_spacing(
+                        X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
+                    )
+                    rows.append(ac)
+                    counts.append(cnts)
+
+            autocorr_matrix = np.asarray(rows, dtype=np.float32)
+            counts_matrix = np.asarray(counts, dtype=np.int32)
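`binary_autocorrelation_with_spacing` comes from `smftools.tools.spatial_autocorrelation` and is not part of this hunk. A sketch of the natural reading of its name and signature: correlate calls at position pairs separated by exactly `lag` bp, skip NaN calls, and return the per-lag pair counts alongside. Positions are assumed sorted ascending; this is illustrative, not the smftools source:

```python
# Spacing-aware binary autocorrelation sketch (assumed behavior).
import numpy as np

def binary_autocorr_sketch(row, positions, max_lag, return_counts=True):
    row = np.asarray(row, dtype=float)
    positions = np.asarray(positions)
    idx = np.flatnonzero(~np.isnan(row))
    vals = row[idx] - np.nanmean(row)          # center so products act like covariance
    pos = positions[idx]
    ac = np.full(max_lag + 1, np.nan, dtype=np.float32)
    counts = np.zeros(max_lag + 1, dtype=np.int32)
    var = float(np.mean(vals ** 2)) if len(idx) else 0.0
    if var == 0 or len(idx) < 2:
        return (ac, counts) if return_counts else ac
    sums = np.zeros(max_lag + 1)
    for a in range(len(idx) - 1):
        lags_ab = pos[a + 1:] - pos[a]         # genomic spacings to later sites
        keep = lags_ab <= max_lag
        np.add.at(sums, lags_ab[keep], vals[a] * vals[a + 1:][keep])
        np.add.at(counts, lags_ab[keep], 1)
    nz = counts > 0
    ac[nz] = (sums[nz] / (counts[nz] * var)).astype(np.float32)
    ac[0] = 1.0                                # zero-lag correlation is 1 by definition
    return (ac, counts) if return_counts else ac

# tiny demo: 1 = modified site, 0 = unmodified, NaN = no call
row = np.array([1, 0, np.nan, 1, 1, 0], dtype=float)
positions = np.array([10, 12, 15, 20, 22, 30])
ac, counts = binary_autocorr_sketch(row, positions, max_lag=20)
```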
+
+            # store raw per-molecule arrays (keep the in-memory format compact)
+            adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
+            adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
+            adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
+
+            # compute global periodicity metrics across all molecules for this site_type
+            try:
+                results = analyze_autocorr_matrix(
+                    autocorr_matrix, counts_matrix, lags,
+                    nrl_search_bp=(120, 260), pad_factor=4, min_count=20, max_harmonics=6
+                )
+            except Exception as e:
+                results = {"error": str(e)}
+
+            # store the global metrics
+            global_metrics = {
+                "nrl_bp": results.get("nrl_bp", np.nan),
+                "xi": results.get("xi", np.nan),
+                "snr": results.get("snr", np.nan),
+                "fwhm_bp": results.get("fwhm_bp", np.nan),
+                "envelope_sample_lags": results.get("envelope_sample_lags", np.array([])).tolist(),
+                "envelope_heights": results.get("envelope_heights", np.array([])).tolist(),
+                "analyzer_error": results.get("error", None),
+            }
+            adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
+
+            # bootstrap for a CI (use a reasonable default; set it low only for debugging)
+            n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
+            # if the user intentionally set a very low n_boot in cfg, keep it; otherwise default to 200
+            try:
+                bs = bootstrap_periodicity(
+                    autocorr_matrix, counts_matrix, lags,
+                    n_boot=n_boot, nrl_search_bp=(120, 260), pad_factor=4, min_count=20
+                )
+                adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
+                    "nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
+                    "xi_boot": np.asarray(bs["xi_boot"]).tolist(),
+                }
+            except Exception as e:
+                adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
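`analyze_autocorr_matrix` with its `nrl_search_bp` and `pad_factor` arguments suggests a spectral estimate: average the per-molecule autocorrelations with count weighting, zero-pad, and report the dominant FFT period inside the nucleosome-repeat-length search band, plus a peak-to-background SNR. A sketch under that assumption (not the smftools analyzer, which also reports `xi`, `fwhm_bp`, and envelope fields):

```python
# Assumed spectral NRL estimate (illustration only).
import numpy as np

def estimate_nrl_sketch(autocorr_matrix, counts_matrix, lags,
                        nrl_search_bp=(120, 260), pad_factor=4, min_count=20):
    # count-weighted mean autocorrelation, ignoring poorly supported lags
    weights = np.where(counts_matrix >= min_count, counts_matrix, 0)
    mean_ac = np.nansum(autocorr_matrix * weights, axis=0) / np.maximum(weights.sum(axis=0), 1)
    mean_ac = np.nan_to_num(mean_ac - np.nanmean(mean_ac))
    n = len(lags) * pad_factor                   # zero-padding refines the frequency grid
    power = np.abs(np.fft.rfft(mean_ac, n=n)) ** 2
    freqs = np.fft.rfftfreq(n, d=1.0)            # cycles per bp (1 bp lag spacing)
    periods = np.divide(1.0, freqs, out=np.full_like(freqs, np.inf), where=freqs > 0)
    band = (periods >= nrl_search_bp[0]) & (periods <= nrl_search_bp[1])
    peak = np.argmax(np.where(band, power, -np.inf))
    snr = power[peak] / np.median(power[band])   # crude peak-to-background ratio
    return {"nrl_bp": float(periods[peak]), "snr": float(snr)}

lags = np.arange(501)
demo_ac = np.cos(2 * np.pi * lags / 185.0)[None, :] * np.ones((5, 1))
demo_counts = np.full(demo_ac.shape, 100, dtype=int)
print(estimate_nrl_sketch(demo_ac, demo_counts, lags))  # recovers a period close to 185
```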
+
+            # ----------------------------
+            # Compute group-level metrics for plotting (per sample × reference)
+            # ----------------------------
+            metrics_by_group = {}
+            sample_col = cfg.sample_name_col_for_plotting
+            ref_col = cfg.reference_strand_col if hasattr(cfg, "reference_strand_col") else "Reference_strand"
+            samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
+            refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
+
+            # iterate over groups and run the analyzer on each group's subset; cache errors
+            for sample_name in samples:
+                sample_mask = (adata.obs[sample_col].values == sample_name)
+                # combined group
+                mask = sample_mask
+                ac_sel = autocorr_matrix[mask, :]
+                cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
+                if ac_sel.size:
+                    try:
+                        r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
+                                                    lags, nrl_search_bp=(120, 260), pad_factor=4, min_count=10, max_harmonics=6)
+                    except Exception as e:
+                        r = {"error": str(e)}
+                else:
+                    r = {"error": "no_data"}
+                metrics_by_group[(sample_name, None)] = r
+
+                # per-reference groups
+                for ref in refs:
+                    mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
+                    ac_sel = autocorr_matrix[mask_ref, :]
+                    cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
+                    if ac_sel.size:
+                        try:
+                            r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
+                                                        lags, nrl_search_bp=(120, 260), pad_factor=4, min_count=10, max_harmonics=6)
+                        except Exception as e:
+                            r = {"error": str(e)}
+                    else:
+                        r = {"error": "no_data"}
+                    metrics_by_group[(sample_name, ref)] = r
+
+            # persist the group metrics
+            adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
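`bootstrap_periodicity` presumably resamples molecules (rows) with replacement and re-runs the periodicity estimate to get confidence intervals for `nrl_boot`/`xi_boot`. A sketch of that pattern, reusing the hypothetical `estimate_nrl_sketch` from the previous snippet:

```python
# Bootstrap-over-molecules sketch (assumed behavior); reuses estimate_nrl_sketch
# from the snippet above.
import numpy as np

def bootstrap_nrl_sketch(autocorr_matrix, counts_matrix, lags, n_boot=200, seed=0):
    rng = np.random.default_rng(seed)
    n = autocorr_matrix.shape[0]
    nrl_boot = []
    for _ in range(n_boot):
        take = rng.integers(0, n, size=n)         # resample reads with replacement
        r = estimate_nrl_sketch(autocorr_matrix[take], counts_matrix[take], lags)
        nrl_boot.append(r["nrl_bp"])
    lo, hi = np.percentile(nrl_boot, [2.5, 97.5])  # 95% percentile interval
    return {"nrl_boot": np.asarray(nrl_boot), "ci_95": (float(lo), float(hi))}
```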
+
+            global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
+
+            # configuration / sensible defaults (override in cfg if present)
+            rolling_cfg = {
+                "window_size": getattr(cfg, "rolling_window_size", getattr(cfg, "autocorr_rolling_window_size", 600)),
+                "step": getattr(cfg, "rolling_step", 100),
+                "max_lag": getattr(cfg, "rolling_max_lag", cfg.autocorr_max_lag if hasattr(cfg, "autocorr_max_lag") else 500),
+                "min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
+                "nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
+                "pad_factor": getattr(cfg, "rolling_pad_factor", 4),
+                "min_count_for_mean": getattr(cfg, "rolling_min_count_for_mean", 10),
+                "max_harmonics": getattr(cfg, "rolling_max_harmonics", 6),
+                "n_jobs": getattr(cfg, "rolling_n_jobs", 4),
+            }
+
+            write_plots = getattr(cfg, "rolling_write_plots", True)
+            write_csvs = getattr(cfg, "rolling_write_csvs", True)
+            min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)  # only run the rolling analysis if the group has at least this many molecules
+
+            rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
+            os.makedirs(rolling_out_dir, exist_ok=True)
+            # also a per-site subfolder
+            site_out_dir = os.path.join(rolling_out_dir, site_type)
+            os.makedirs(site_out_dir, exist_ok=True)
+
+            combined_rows = []  # accumulate one row per window for the combined CSV
+            rolling_results_by_group = {}  # store a DataFrame per group in memory (persisted later to adata.uns)
+
+            # iterate over groups (samples × refs); `samples` and `refs` were computed above
+            for sample_name in samples:
+                sample_mask = (adata.obs[sample_col].values == sample_name)
+                # first the combined group ("all refs")
+                group_masks = [("all", sample_mask)]
+                # then the per-reference groups
+                for ref in refs:
+                    ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
+                    group_masks.append((ref, ref_mask))
+
+                for ref_label, mask in group_masks:
+                    n_group = int(mask.sum())
+                    if n_group < min_molecules_for_group:
+                        # skip tiny groups
+                        if getattr(cfg, "verbosity", 0):
+                            print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
+                        # an empty CSV row set could still be written here if desired; we skip instead
+                        continue
+
+                    # extract the group matrix X_group (works with dense or sparse adata.layers)
+                    X_group = X[mask, :]
+                    # positions were already set above
+                    try:
+                        # call the rolling helper (this may be slow; it uses cfg.n_jobs)
+                        df_roll = rolling_autocorr_metrics(
+                            X_group,
+                            positions,
+                            site_label=site_type,
+                            window_size=rolling_cfg["window_size"],
+                            step=rolling_cfg["step"],
+                            max_lag=rolling_cfg["max_lag"],
+                            min_molecules_per_window=rolling_cfg["min_molecules_per_window"],
+                            nrl_search_bp=rolling_cfg["nrl_search_bp"],
+                            pad_factor=rolling_cfg["pad_factor"],
+                            min_count_for_mean=rolling_cfg["min_count_for_mean"],
+                            max_harmonics=rolling_cfg["max_harmonics"],
+                            n_jobs=rolling_cfg["n_jobs"],
+                            verbose=False,
+                            fixed_nrl_bp=global_nrl
+                        )
+                    except Exception as e:
+                        warnings.warn(f"rolling_autocorr_metrics failed for {site_type} {sample_name} {ref_label}: {e}")
+                        continue
+
+                    # normalize the column names and keep only the compact set:
+                    # center, n_molecules, nrl_bp, snr, xi, fwhm_bp
+                    if "center" not in df_roll.columns:
+                        # defensive: if the rolling function returned a different schema, skip
+                        warnings.warn(f"rolling_autocorr_metrics returned unexpected schema for {site_type} {sample_name} {ref_label}")
+                        continue
+
+                    compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
+                    compact_df["site"] = site_type
+                    compact_df["sample"] = sample_name
+                    compact_df["reference"] = ref_label
+
+                    # sanitized names; used by both the CSV and the plot writers below
+                    safe_sample = str(sample_name).replace(os.sep, "_")
+                    safe_ref = str(ref_label).replace(os.sep, "_")
+
+                    # save a per-group CSV
+                    if write_csvs:
+                        out_csv = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.csv")
+                        try:
+                            compact_df.to_csv(out_csv, index=False)
+                        except Exception as e:
+                            warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
+
+                    # save a per-group plot (NRL and SNR vs window center)
+                    if write_plots:
+                        try:
+                            # use the plotting helper; if it lives in a different module, import accordingly
+                            from ..plotting import plot_rolling_metrics as _plot_roll
+                        except Exception:
+                            _plot_roll = globals().get("plot_rolling_metrics", None)
+                        if _plot_roll is not None:
+                            plot_png = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.png")
+                            try:
+                                _plot_roll(compact_df, out_png=plot_png,
+                                           title=f"{site_type} {sample_name} {ref_label}",
+                                           figsize=(10, 3.5), dpi=160, show=False)
+                            except Exception as e:
+                                warnings.warn(f"Failed to create rolling plot for {site_type} {sample_name} {ref_label}: {e}")
+
+                    # store in combined_rows and the in-memory dict
+                    combined_rows.append(compact_df.assign(site=site_type, sample=sample_name, reference=ref_label))
+                    rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
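`rolling_autocorr_metrics` slides a window along the locus and summarizes periodicity per window; the schema the caller checks for is `center`, `n_molecules`, `nrl_bp`, `snr`, `xi`, `fwhm_bp`. A stripped-down windowing skeleton, leaning on the two hypothetical helpers sketched earlier (it fills only a subset of those columns):

```python
# Windowing skeleton only (assumed shape of the real helper's loop); requires
# binary_autocorr_sketch and estimate_nrl_sketch from the earlier snippets.
import numpy as np
import pandas as pd

def rolling_nrl_sketch(X, positions, window_size=600, step=100,
                       max_lag=500, min_molecules_per_window=10):
    rows = []
    for start in range(int(positions.min()), int(positions.max()) - window_size + 1, step):
        in_win = (positions >= start) & (positions < start + window_size)
        X_win = X[:, in_win]
        covered = ~np.all(np.isnan(X_win), axis=1)   # molecules with data in this window
        if covered.sum() < min_molecules_per_window:
            continue
        acs, cnts = zip(*(binary_autocorr_sketch(row, positions[in_win], max_lag)
                          for row in X_win[covered]))
        # min_count lowered: single windows contribute far fewer pairs per lag
        r = estimate_nrl_sketch(np.vstack(acs), np.vstack(cnts),
                                np.arange(max_lag + 1), min_count=5)
        rows.append({"center": start + window_size // 2,
                     "n_molecules": int(covered.sum()), **r})
    return pd.DataFrame(rows)
```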
+
+            # persist the per-site rolling metrics into adata.uns as a dict of DataFrames (or an empty dict)
+            adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
+
+            # write a combined CSV for this site across all groups
+            if len(combined_rows):
+                combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
+                combined_out_csv = os.path.join(rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv")
+                try:
+                    combined_df_site.to_csv(combined_out_csv, index=False)
+                except Exception as e:
+                    warnings.warn(f"Failed to write combined rolling CSV for {site_type}: {e}")
+
+            rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
+            plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
+            os.makedirs(plot_out_dir, exist_ok=True)
+            pages = plot_rolling_grid(rolling_dict, plot_out_dir, site_type,
+                                      rows_per_page=cfg.rows_per_qc_autocorr_grid,
+                                      cols_per_page=len(refs),
+                                      dpi=160,
+                                      metrics=("nrl_bp", "snr", "xi"),
+                                      per_metric_ylim={"snr": (0, 25)})
+
+        from ..plotting import plot_spatial_autocorr_grid
+        make_dirs([pp_autocorr_dir])
+
+        plot_spatial_autocorr_grid(adata,
+                                   pp_autocorr_dir,
+                                   site_types=cfg.autocorr_site_types,
+                                   sample_col=cfg.sample_name_col_for_plotting,
+                                   window=cfg.autocorr_rolling_window_size,
+                                   rows_per_fig=cfg.rows_per_qc_autocorr_grid)
+
+    ############ Pearson analyses ###############
+    if smf_modality != 'direct':
+        from ..tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
+
+        pp_corr_dir = pp_dir / "09_correlation_matrices"
+
+        if pp_corr_dir.is_dir():
+            print(f'{pp_corr_dir} already exists. Skipping correlation matrix plotting.')
+        else:
+            compute_positionwise_statistics(
+                adata,
+                layer="nan0_0minus1",
+                methods=cfg.correlation_matrix_types,
+                sample_col=cfg.sample_name_col_for_plotting,
+                ref_col=cfg.reference_column,
+                output_key="positionwise_result",
+                site_types=cfg.correlation_matrix_site_types,
+                encoding="signed",
+                max_threads=cfg.threads,
+                min_count_for_pairwise=10,
+            )
+
+            plot_positionwise_matrices(
+                adata,
+                methods=cfg.correlation_matrix_types,
+                sample_col=cfg.sample_name_col_for_plotting,
+                ref_col=cfg.reference_column,
+                figsize_per_cell=(4.0, 3.0),
+                dpi=160,
+                cmaps=cfg.correlation_matrix_cmaps,
+                vmin=None,
+                vmax=None,
+                output_dir=pp_corr_dir,
+                output_key="positionwise_result"
+            )
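`compute_positionwise_statistics` runs on the `nan0_0minus1` layer with `encoding="signed"`, i.e. reads encoded as +1 (modified), -1 (unmodified), 0 (missing). One straightforward NaN-aware way to build a position-by-position Pearson matrix with a minimum pair-count mask, as a sketch of the idea rather than the smftools implementation:

```python
# Masked positionwise Pearson sketch. Input: reads x positions in {+1, -1, 0=missing}.
import numpy as np

def positionwise_pearson_sketch(signed, min_count_for_pairwise=10):
    X = np.where(signed == 0, np.nan, signed).astype(float)
    valid = ~np.isnan(X)
    Xz = np.nan_to_num(X - np.nanmean(X, axis=0))      # center each position; missing -> 0
    n_pairs = valid.T.astype(int) @ valid.astype(int)  # pairwise observation counts
    cov = (Xz.T @ Xz) / np.maximum(n_pairs, 1)         # products over jointly observed reads
    sd = np.sqrt(np.maximum(np.diag(cov), 1e-12))
    corr = cov / np.outer(sd, sd)
    corr[n_pairs < min_count_for_pairwise] = np.nan    # mask weakly supported cells
    return corr

rng = np.random.default_rng(0)
demo = rng.choice([-1, 0, 1], size=(500, 40), p=[0.45, 0.1, 0.45])
print(positionwise_pearson_sketch(demo).shape)  # (40, 40)
```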
+
+    ####### Save the basic-analysis adata - post preprocessing and duplicate removal ################
+    from ..readwrite import safe_write_h5ad
+    if not spatial_adata_path.exists() or cfg.force_redo_preprocessing:
+        print('Saving the spatially analyzed adata post preprocessing and duplicate removal')
+        if ".gz" == spatial_adata_path.suffix:
+            print(f"Spatial adata path: {spatial_adata_path}")
+            safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
+        else:
+            spatial_adata_path = spatial_adata_path.with_name(spatial_adata_path.name + '.gz')
+            print(f"Spatial adata path: {spatial_adata_path}")
+            safe_write_h5ad(adata, spatial_adata_path, compression='gzip', backup=True)
+    ############################################### smftools spatial end ###############################################
+
+    add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
+
+    return adata, spatial_adata_path
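`safe_read_h5ad`/`safe_write_h5ad` are new helpers in `smftools/readwrite.py` (see the file list above); the `backup=True` flag and the returned load report are smftools-specific extras layered over anndata's standard reader and writer, which handle the gzip-compressed layout directly:

```python
# Plain anndata equivalents of the wrapped calls (the safe_* extras such as
# backup=True and the load report are smftools-specific; filenames are placeholders).
import numpy as np
import anndata as ad

adata = ad.AnnData(np.zeros((4, 3), dtype=np.float32))
adata.write_h5ad("demo.h5ad.gz", compression="gzip")  # gzip-compressed on-disk layout
back = ad.read_h5ad("demo.h5ad.gz")                   # read back with no special handling
```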