smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +18 -2
- smftools/cli/hmm_adata.py +18 -1
- smftools/cli/latent_adata.py +522 -67
- smftools/cli/load_adata.py +2 -2
- smftools/cli/preprocess_adata.py +32 -93
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +23 -109
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +41 -5
- smftools/config/conversion.yaml +0 -10
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +49 -13
- smftools/config/experiment_config.py +96 -3
- smftools/constants.py +4 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +53 -13
- smftools/informatics/h5ad_functions.py +83 -0
- smftools/informatics/modkit_extract_to_adata.py +4 -0
- smftools/plotting/__init__.py +26 -12
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +58 -3362
- smftools/plotting/hmm_plotting.py +1586 -2
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +3 -0
- smftools/preprocessing/append_base_context.py +1 -1
- smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +109 -85
- smftools/tools/__init__.py +6 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_nmf.py +18 -7
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +70 -154
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +640 -3
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +52 -4
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,1893 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from math import floor
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Sequence
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from smftools.logging_utils import get_logger
|
|
12
|
+
from smftools.optional_imports import require
|
|
13
|
+
from smftools.plotting.plotting_utils import (
|
|
14
|
+
_methylation_fraction_for_layer,
|
|
15
|
+
clean_barplot,
|
|
16
|
+
make_row_colors,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Optional plotting dependencies are resolved through ``require`` instead of plain
# ``import`` statements; each call names the module, the package extra that provides it,
# and what it is needed for.
# NOTE(review): presumably ``require`` raises an informative error pointing at the
# "plotting" extra when the module is missing — confirm in smftools.optional_imports.
plt = require("matplotlib.pyplot", extra="plotting", purpose="plot rendering")
colors = require("matplotlib.colors", extra="plotting", purpose="plot rendering")
grid_spec = require("matplotlib.gridspec", extra="plotting", purpose="heatmap plotting")
sns = require("seaborn", extra="plotting", purpose="plot styling")

# Module-level logger used by the plotting functions below.
logger = get_logger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def plot_rolling_nn_and_layer(
    subset,
    obsm_key: str = "rolling_nn_dist",
    layer_key: str = "nan0_0minus1",
    meta_cols: tuple[str, ...] = ("Reference_strand", "Sample"),
    col_cluster: bool = False,
    fill_nn_with_colmax: bool = True,
    fill_layer_value: float = 0.0,
    drop_all_nan_windows: bool = True,
    max_nan_fraction: float | None = None,
    var_valid_fraction_col: str | None = None,
    var_nan_fraction_col: str | None = None,
    read_span_layer: str | None = "read_span_mask",
    outside_read_color: str = "#bdbdbd",
    nn_nan_color: str = "#bdbdbd",
    figsize: tuple[float, float] = (14, 10),
    right_panel_var_mask=None,  # optional boolean mask over subset.var to reduce width
    robust: bool = True,
    title: str | None = None,
    xtick_step: int | None = None,
    xtick_rotation: int = 90,
    xtick_fontsize: int = 8,
    save_name: str | None = None,
) -> pd.Index:
    """
    Plot rolling NN distances next to a layer heatmap in one shared row order.

    1) Cluster rows by subset.obsm[obsm_key] (rolling NN distances)
    2) Plot two heatmaps side-by-side in the SAME row order, with mean barplots above:
         - left: rolling NN distance matrix
         - right: subset.layers[layer_key] matrix

    Rows whose rolling NN distances are all NaN are excluded from every panel.
    Clustering always runs on a copy with NaNs replaced by the per-column maximum;
    ``fill_nn_with_colmax`` only controls whether that filled copy is also what is drawn.

    Args:
        subset: AnnData subset with rolling NN distances stored in ``obsm``.
        obsm_key: Key in ``subset.obsm`` containing rolling NN distances.
        layer_key: Layer name to plot alongside rolling NN distances.
        meta_cols: Obs columns used for row color annotations.
        col_cluster: Whether to cluster columns in the rolling NN clustermap.
        fill_nn_with_colmax: Fill NaNs in rolling NN distances with per-column max values.
        fill_layer_value: Fill NaNs in the layer heatmap with this value.
        drop_all_nan_windows: Drop rolling windows that are all NaN.
        max_nan_fraction: Maximum allowed NaN fraction per position (filtering columns).
            Requires ``var_nan_fraction_col`` or ``var_valid_fraction_col`` to name an
            existing ``subset.var`` column; otherwise a warning is logged and no
            filtering is applied.
        var_valid_fraction_col: ``subset.var`` column with valid fractions (1 - NaN fraction).
        var_nan_fraction_col: ``subset.var`` column with NaN fractions. Takes precedence
            over ``var_valid_fraction_col`` when both are usable.
        read_span_layer: Layer name with read span mask; 0 values are treated as outside read.
        outside_read_color: Color used to show positions outside each read.
        nn_nan_color: Color used for NaNs in the rolling NN heatmap.
        figsize: Figure size for the combined plot.
        right_panel_var_mask: Optional boolean mask over ``subset.var`` for the right panel.
        robust: Use robust color scaling in seaborn.
        title: Optional figure title (suptitle).
        xtick_step: Spacing between x-axis tick labels.
        xtick_rotation: Rotation for x-axis tick labels.
        xtick_fontsize: Font size for x-axis tick labels.
        save_name: Optional output path for saving the plot. When given, the figure is
            written to disk and then closed; otherwise it is shown.

    Returns:
        The obs names (as strings) of the plotted rows, in clustered order.

    Raises:
        ValueError: If ``max_nan_fraction`` is outside ``[0, 1]`` or the effective
            column mask does not have one entry per ``subset.var_names``.
    """
    if max_nan_fraction is not None and not (0 <= max_nan_fraction <= 1):
        raise ValueError("max_nan_fraction must be between 0 and 1.")

    logger.info("Plotting rolling NN distances with layer '%s'.", layer_key)

    def _apply_xticks(ax, labels, step):
        """Label every ``step``-th heatmap column, centred on its cell."""
        if labels is None or len(labels) == 0:
            ax.set_xticks([])
            return
        if step is None or step <= 0:
            # No usable step given: aim for roughly ten labels across the axis.
            step = max(1, len(labels) // 10)
        ticks = np.arange(0, len(labels), step)
        # Heatmap cells span [i, i + 1), so +0.5 puts the tick at the cell centre.
        ax.set_xticks(ticks + 0.5)
        ax.set_xticklabels(
            [labels[i] for i in ticks],
            rotation=xtick_rotation,
            fontsize=xtick_fontsize,
        )

    def _format_labels(values):
        """Stringify labels, printing integer-valued numbers without a decimal part."""
        values = np.asarray(values)
        if np.issubdtype(values.dtype, np.number):
            if np.all(np.isfinite(values)) and np.all(np.isclose(values, np.round(values))):
                values = np.round(values).astype(int)
        return [str(v) for v in values]

    def _to_dense(matrix):
        """Return ``matrix`` as a dense array, densifying sparse inputs."""
        return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    # --- Rolling NN matrix: keep only rows with at least one finite distance. ---
    X = subset.obsm[obsm_key]
    valid = ~np.all(np.isnan(X), axis=1)

    X_df = pd.DataFrame(X[valid], index=subset.obs_names[valid])

    if drop_all_nan_windows:
        X_df = X_df.loc[:, ~X_df.isna().all(axis=0)]

    # Hierarchical clustering cannot handle NaNs, so cluster on a col-max-filled copy.
    col_max = X_df.max(axis=0, skipna=True).fillna(0)
    X_df_cluster = X_df.fillna(col_max)
    X_df_cluster.index = X_df_cluster.index.astype(str)
    if fill_nn_with_colmax:
        X_df_display = X_df_cluster
    else:
        X_df_display = X_df.copy()
        X_df_display.index = X_df_display.index.astype(str)

    meta = subset.obs.loc[X_df_cluster.index, list(meta_cols)].copy()
    meta.index = meta.index.astype(str)
    row_colors = make_row_colors(meta)

    # The clustermap is only used to obtain the row order; its figure is discarded.
    g = sns.clustermap(
        X_df_cluster,
        cmap="viridis",
        col_cluster=col_cluster,
        row_cluster=True,
        row_colors=row_colors,
        xticklabels=False,
        yticklabels=False,
        robust=robust,
    )
    row_order = g.dendrogram_row.reordered_ind
    ordered_index = X_df_cluster.index[row_order]
    plt.close(g.fig)

    X_ord = X_df_display.loc[ordered_index]

    # --- Layer matrix for the right panel, restricted to the same rows. ---
    L_df = pd.DataFrame(
        _to_dense(subset.layers[layer_key])[valid],
        index=subset.obs_names[valid],
        columns=subset.var_names,
    )
    L_df.index = L_df.index.astype(str)

    if right_panel_var_mask is not None:
        if hasattr(right_panel_var_mask, "values"):
            right_panel_var_mask = right_panel_var_mask.values
        right_panel_var_mask = np.asarray(right_panel_var_mask, dtype=bool)

    if max_nan_fraction is not None:
        nan_fraction = None
        if var_nan_fraction_col and var_nan_fraction_col in subset.var:
            nan_fraction = pd.to_numeric(
                subset.var[var_nan_fraction_col], errors="coerce"
            ).to_numpy()
        elif var_valid_fraction_col and var_valid_fraction_col in subset.var:
            valid_fraction = pd.to_numeric(
                subset.var[var_valid_fraction_col], errors="coerce"
            ).to_numpy()
            nan_fraction = 1 - valid_fraction
        if nan_fraction is None:
            # A threshold was requested but there is nothing to evaluate it against;
            # say so instead of silently plotting unfiltered columns.
            logger.warning(
                "max_nan_fraction=%s was requested but neither var_nan_fraction_col nor "
                "var_valid_fraction_col names a column in subset.var; "
                "no NaN-fraction filtering was applied.",
                max_nan_fraction,
            )
        else:
            # NaN fractions (e.g. from coercion failures) compare False and are dropped.
            nan_mask = nan_fraction <= max_nan_fraction
            if right_panel_var_mask is None:
                right_panel_var_mask = nan_mask
            else:
                right_panel_var_mask = right_panel_var_mask & nan_mask

    if right_panel_var_mask is not None:
        if right_panel_var_mask.size != L_df.shape[1]:
            raise ValueError("right_panel_var_mask must align with subset.var_names.")
        L_df = L_df.loc[:, right_panel_var_mask]

    # --- Optional read-span mask: True where a position lies outside the read. ---
    read_span_mask = None
    if read_span_layer and read_span_layer in subset.layers:
        span_df = pd.DataFrame(
            _to_dense(subset.layers[read_span_layer])[valid],
            index=subset.obs_names[valid],
            columns=subset.var_names,
        )
        span_df.index = span_df.index.astype(str)
        if right_panel_var_mask is not None:
            span_df = span_df.loc[:, right_panel_var_mask]
        read_span_mask = span_df.loc[ordered_index].to_numpy() == 0

    L_ord = L_df.loc[ordered_index]
    L_plot = L_ord.fillna(fill_layer_value)
    if read_span_mask is not None:
        # Masked cells become NaN and are drawn with the colormap's "bad" color.
        L_plot = L_plot.mask(read_span_mask)

    # --- Figure layout: bar row on top, heatmap row below, slim colorbar columns. ---
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        2,
        4,
        width_ratios=[1, 0.05, 1, 0.05],
        height_ratios=[1, 6],
        wspace=0.2,
        hspace=0.05,
    )

    ax1 = fig.add_subplot(gs[1, 0])
    ax1_cbar = fig.add_subplot(gs[1, 1])
    ax2 = fig.add_subplot(gs[1, 2])
    ax2_cbar = fig.add_subplot(gs[1, 3])
    ax1_bar = fig.add_subplot(gs[0, 0], sharex=ax1)
    ax2_bar = fig.add_subplot(gs[0, 2], sharex=ax2)
    fig.add_subplot(gs[0, 1]).axis("off")
    fig.add_subplot(gs[0, 3]).axis("off")

    # --- Left panel: rolling NN distances. ---
    mean_nn = np.nanmean(X_ord.to_numpy(), axis=0)
    clean_barplot(
        ax1_bar,
        mean_nn,
        obsm_key,
        y_max=None,
        y_label="Mean distance",
        y_ticks=None,
    )

    nn_cmap = plt.get_cmap("viridis").copy()
    nn_cmap.set_bad(nn_nan_color)
    sns.heatmap(
        X_ord,
        ax=ax1,
        cmap=nn_cmap,
        xticklabels=False,
        yticklabels=False,
        robust=robust,
        cbar_ax=ax1_cbar,
    )

    # Window labels come from uns: prefer window centers, fall back to window starts.
    label_source = subset.uns.get(f"{obsm_key}_centers")
    if label_source is None:
        label_source = subset.uns.get(f"{obsm_key}_starts")
    if label_source is not None:
        label_source = np.asarray(label_source)
        window_labels = _format_labels(label_source)
        try:
            # Columns keep their original window index after all-NaN windows are
            # dropped, so use them to pick the matching labels when possible.
            col_idx = X_ord.columns.to_numpy()
            if np.issubdtype(col_idx.dtype, np.number):
                col_idx = col_idx.astype(int)
                if col_idx.size and col_idx.max() < len(label_source):
                    window_labels = _format_labels(label_source[col_idx])
        except Exception:
            # Best effort only: fall back to the unfiltered labels.
            window_labels = _format_labels(label_source)
        _apply_xticks(ax1, window_labels, xtick_step)

    # --- Right panel: layer values. ---
    methylation_fraction = _methylation_fraction_for_layer(L_plot.to_numpy(), layer_key)
    clean_barplot(
        ax2_bar,
        methylation_fraction,
        layer_key,
        y_max=1.0,
        y_label="Methylation fraction",
        y_ticks=[0.0, 0.5, 1.0],
    )

    layer_cmap = plt.get_cmap("coolwarm").copy()
    if read_span_mask is not None:
        layer_cmap.set_bad(outside_read_color)

    sns.heatmap(
        L_plot,
        ax=ax2,
        cmap=layer_cmap,
        xticklabels=False,
        yticklabels=False,
        robust=robust,
        cbar_ax=ax2_cbar,
    )
    _apply_xticks(ax2, [str(x) for x in L_plot.columns], xtick_step)

    if title:
        fig.suptitle(title)

    if save_name is not None:
        fname = os.path.join(save_name)
        try:
            # Save this figure explicitly rather than whatever pyplot considers current.
            fig.savefig(fname, dpi=200, bbox_inches="tight")
        finally:
            # Release the figure so repeated calls (e.g. per sample) do not pile up.
            plt.close(fig)
        logger.info("Saved rolling NN/layer plot to %s.", fname)
    else:
        plt.show()

    return ordered_index
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def plot_rolling_nn_and_two_layers(
    subset,
    obsm_key: str = "rolling_nn_dist",
    layer_keys: Sequence[str] = ("nan0_0minus1", "nan0_0minus1"),
    meta_cols: tuple[str, ...] = ("Reference_strand", "Sample"),
    col_cluster: bool = False,
    fill_nn_with_colmax: bool = True,
    fill_layer_value: float = 0.0,
    drop_all_nan_windows: bool = True,
    max_nan_fraction: float | None = None,
    var_valid_fraction_col: str | None = None,
    var_nan_fraction_col: str | None = None,
    read_span_layer: str | None = "read_span_mask",
    outside_read_color: str = "#bdbdbd",
    nn_nan_color: str = "#bdbdbd",
    figsize: tuple[float, float] = (20, 10),
    layer_var_mask=None,
    robust: bool = True,
    title: str | None = None,
    xtick_step: int | None = None,
    xtick_rotation: int = 90,
    xtick_fontsize: int = 8,
    save_name: str | None = None,
) -> pd.Index:
    """
    Plot rolling NN distances alongside two layer heatmaps in one shared row order.

    Rows are clustered on the rolling NN distances (NaNs replaced by the per-column
    maximum for clustering only, unless ``fill_nn_with_colmax`` also asks for the filled
    values to be drawn). Rows whose rolling NN distances are all NaN are excluded from
    every panel. The first layer is drawn with the "coolwarm" colormap and the second
    with "Greens".

    Args:
        subset: AnnData subset with rolling NN distances stored in ``obsm``.
        obsm_key: Key in ``subset.obsm`` containing rolling NN distances.
        layer_keys: Two layer names to plot alongside rolling NN distances.
        meta_cols: Obs columns used for row color annotations.
        col_cluster: Whether to cluster columns in the rolling NN clustermap.
        fill_nn_with_colmax: Fill NaNs in rolling NN distances with per-column max values.
        fill_layer_value: Fill NaNs in the layer heatmaps with this value.
        drop_all_nan_windows: Drop rolling windows that are all NaN.
        max_nan_fraction: Maximum allowed NaN fraction per position (filtering columns).
            Requires ``var_nan_fraction_col`` or ``var_valid_fraction_col`` to name an
            existing ``subset.var`` column; otherwise a warning is logged and no
            filtering is applied.
        var_valid_fraction_col: ``subset.var`` column with valid fractions (1 - NaN fraction).
        var_nan_fraction_col: ``subset.var`` column with NaN fractions. Takes precedence
            over ``var_valid_fraction_col`` when both are usable.
        read_span_layer: Layer name with read span mask; 0 values are treated as outside read.
        outside_read_color: Color used to show positions outside each read.
        nn_nan_color: Color used for NaNs in the rolling NN heatmap.
        figsize: Figure size for the combined plot.
        layer_var_mask: Optional boolean mask over ``subset.var`` for the layer panels.
        robust: Use robust color scaling in seaborn.
        title: Optional figure title (suptitle).
        xtick_step: Spacing between x-axis tick labels.
        xtick_rotation: Rotation for x-axis tick labels.
        xtick_fontsize: Font size for x-axis tick labels.
        save_name: Optional output path for saving the plot. When given, the figure is
            written to disk and then closed; otherwise it is shown.

    Returns:
        The obs names (as strings) of the plotted rows, in clustered order.

    Raises:
        ValueError: If ``layer_keys`` does not hold exactly two names, if
            ``max_nan_fraction`` is outside ``[0, 1]``, or if the effective column mask
            does not have one entry per ``subset.var_names``.
    """
    if len(layer_keys) != 2:
        raise ValueError("layer_keys must contain exactly two layer names.")
    if max_nan_fraction is not None and not (0 <= max_nan_fraction <= 1):
        raise ValueError("max_nan_fraction must be between 0 and 1.")

    layer_key_one, layer_key_two = layer_keys
    logger.info(
        "Plotting rolling NN distances with layers '%s' and '%s'.",
        layer_key_one,
        layer_key_two,
    )

    def _apply_xticks(ax, labels, step):
        """Label every ``step``-th heatmap column, centred on its cell."""
        if labels is None or len(labels) == 0:
            ax.set_xticks([])
            return
        if step is None or step <= 0:
            # No usable step given: aim for roughly ten labels across the axis.
            step = max(1, len(labels) // 10)
        ticks = np.arange(0, len(labels), step)
        # Heatmap cells span [i, i + 1), so +0.5 puts the tick at the cell centre.
        ax.set_xticks(ticks + 0.5)
        ax.set_xticklabels(
            [labels[i] for i in ticks],
            rotation=xtick_rotation,
            fontsize=xtick_fontsize,
        )

    def _format_labels(values):
        """Stringify labels, printing integer-valued numbers without a decimal part."""
        values = np.asarray(values)
        if np.issubdtype(values.dtype, np.number):
            if np.all(np.isfinite(values)) and np.all(np.isclose(values, np.round(values))):
                values = np.round(values).astype(int)
        return [str(v) for v in values]

    def _to_dense(matrix):
        """Return ``matrix`` as a dense array, densifying sparse inputs."""
        return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    # --- Rolling NN matrix: keep only rows with at least one finite distance. ---
    X = subset.obsm[obsm_key]
    valid = ~np.all(np.isnan(X), axis=1)

    X_df = pd.DataFrame(X[valid], index=subset.obs_names[valid])
    if drop_all_nan_windows:
        X_df = X_df.loc[:, ~X_df.isna().all(axis=0)]

    # Hierarchical clustering cannot handle NaNs, so cluster on a col-max-filled copy.
    col_max = X_df.max(axis=0, skipna=True).fillna(0)
    X_df_cluster = X_df.fillna(col_max)
    X_df_cluster.index = X_df_cluster.index.astype(str)
    if fill_nn_with_colmax:
        X_df_display = X_df_cluster
    else:
        X_df_display = X_df.copy()
        X_df_display.index = X_df_display.index.astype(str)

    meta = subset.obs.loc[X_df_cluster.index, list(meta_cols)].copy()
    meta.index = meta.index.astype(str)
    row_colors = make_row_colors(meta)

    # The clustermap is only used to obtain the row order; its figure is discarded.
    g = sns.clustermap(
        X_df_cluster,
        cmap="viridis",
        col_cluster=col_cluster,
        row_cluster=True,
        row_colors=row_colors,
        xticklabels=False,
        yticklabels=False,
        robust=robust,
    )
    row_order = g.dendrogram_row.reordered_ind
    ordered_index = X_df_cluster.index[row_order]
    plt.close(g.fig)

    X_ord = X_df_display.loc[ordered_index]

    # --- Column mask shared by both layer panels and the read-span mask. ---
    if layer_var_mask is not None:
        if hasattr(layer_var_mask, "values"):
            layer_var_mask = layer_var_mask.values
        layer_var_mask = np.asarray(layer_var_mask, dtype=bool)

    if max_nan_fraction is not None:
        nan_fraction = None
        if var_nan_fraction_col and var_nan_fraction_col in subset.var:
            nan_fraction = pd.to_numeric(
                subset.var[var_nan_fraction_col], errors="coerce"
            ).to_numpy()
        elif var_valid_fraction_col and var_valid_fraction_col in subset.var:
            valid_fraction = pd.to_numeric(
                subset.var[var_valid_fraction_col], errors="coerce"
            ).to_numpy()
            nan_fraction = 1 - valid_fraction
        if nan_fraction is None:
            # A threshold was requested but there is nothing to evaluate it against;
            # say so instead of silently plotting unfiltered columns.
            logger.warning(
                "max_nan_fraction=%s was requested but neither var_nan_fraction_col nor "
                "var_valid_fraction_col names a column in subset.var; "
                "no NaN-fraction filtering was applied.",
                max_nan_fraction,
            )
        else:
            # NaN fractions (e.g. from coercion failures) compare False and are dropped.
            nan_mask = nan_fraction <= max_nan_fraction
            if layer_var_mask is None:
                layer_var_mask = nan_mask
            else:
                layer_var_mask = layer_var_mask & nan_mask

    if layer_var_mask is not None and layer_var_mask.size != subset.n_vars:
        raise ValueError("layer_var_mask must align with subset.var_names.")

    # --- Optional read-span mask: True where a position lies outside the read. ---
    read_span_mask = None
    if read_span_layer and read_span_layer in subset.layers:
        span_df = pd.DataFrame(
            _to_dense(subset.layers[read_span_layer])[valid],
            index=subset.obs_names[valid],
            columns=subset.var_names,
        )
        span_df.index = span_df.index.astype(str)
        if layer_var_mask is not None:
            span_df = span_df.loc[:, layer_var_mask]
        read_span_mask = span_df.loc[ordered_index].to_numpy() == 0

    def _layer_df_for_key(layer_key: str) -> pd.DataFrame:
        """Return one layer as a frame with the kept rows/columns in clustered order."""
        layer_df = pd.DataFrame(
            _to_dense(subset.layers[layer_key])[valid],
            index=subset.obs_names[valid],
            columns=subset.var_names,
        )
        layer_df.index = layer_df.index.astype(str)
        if layer_var_mask is not None:
            layer_df = layer_df.loc[:, layer_var_mask]
        return layer_df.loc[ordered_index]

    layer_plot_one = _layer_df_for_key(layer_key_one).fillna(fill_layer_value)
    layer_plot_two = _layer_df_for_key(layer_key_two).fillna(fill_layer_value)
    if read_span_mask is not None:
        # Masked cells become NaN and are drawn with the colormap's "bad" color.
        layer_plot_one = layer_plot_one.mask(read_span_mask)
        layer_plot_two = layer_plot_two.mask(read_span_mask)

    # --- Figure layout: bar row on top, heatmap row below, slim colorbar columns. ---
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        2,
        6,
        width_ratios=[1, 0.05, 1, 0.05, 1, 0.05],
        height_ratios=[1, 6],
        wspace=0.2,
        hspace=0.05,
    )

    ax1 = fig.add_subplot(gs[1, 0])
    ax1_cbar = fig.add_subplot(gs[1, 1])
    ax2 = fig.add_subplot(gs[1, 2])
    ax2_cbar = fig.add_subplot(gs[1, 3])
    ax3 = fig.add_subplot(gs[1, 4])
    ax3_cbar = fig.add_subplot(gs[1, 5])
    ax1_bar = fig.add_subplot(gs[0, 0], sharex=ax1)
    ax2_bar = fig.add_subplot(gs[0, 2], sharex=ax2)
    ax3_bar = fig.add_subplot(gs[0, 4], sharex=ax3)
    fig.add_subplot(gs[0, 1]).axis("off")
    fig.add_subplot(gs[0, 3]).axis("off")
    fig.add_subplot(gs[0, 5]).axis("off")

    # --- Left panel: rolling NN distances. ---
    mean_nn = np.nanmean(X_ord.to_numpy(), axis=0)
    clean_barplot(
        ax1_bar,
        mean_nn,
        obsm_key,
        y_max=None,
        y_label="Mean distance",
        y_ticks=None,
    )

    nn_cmap = plt.get_cmap("viridis").copy()
    nn_cmap.set_bad(nn_nan_color)
    sns.heatmap(
        X_ord,
        ax=ax1,
        cmap=nn_cmap,
        xticklabels=False,
        yticklabels=False,
        robust=robust,
        cbar_ax=ax1_cbar,
    )

    # Window labels come from uns: prefer window centers, fall back to window starts.
    label_source = subset.uns.get(f"{obsm_key}_centers")
    if label_source is None:
        label_source = subset.uns.get(f"{obsm_key}_starts")
    if label_source is not None:
        label_source = np.asarray(label_source)
        window_labels = _format_labels(label_source)
        try:
            # Columns keep their original window index after all-NaN windows are
            # dropped, so use them to pick the matching labels when possible.
            col_idx = X_ord.columns.to_numpy()
            if np.issubdtype(col_idx.dtype, np.number):
                col_idx = col_idx.astype(int)
                if col_idx.size and col_idx.max() < len(label_source):
                    window_labels = _format_labels(label_source[col_idx])
        except Exception:
            # Best effort only: fall back to the unfiltered labels.
            window_labels = _format_labels(label_source)
        _apply_xticks(ax1, window_labels, xtick_step)

    # --- Layer panels: bar plots first, then the heatmaps beneath them. ---
    for ax_bar, lp, layer_key in (
        (ax2_bar, layer_plot_one, layer_key_one),
        (ax3_bar, layer_plot_two, layer_key_two),
    ):
        methylation_fraction = _methylation_fraction_for_layer(lp.to_numpy(), layer_key)
        clean_barplot(
            ax_bar,
            methylation_fraction,
            layer_key,
            y_max=1.0,
            y_label="Methylation fraction",
            y_ticks=[0.0, 0.5, 1.0],
        )

    for ax, cbar_ax, lp, cmap_name in (
        (ax2, ax2_cbar, layer_plot_one, "coolwarm"),
        (ax3, ax3_cbar, layer_plot_two, "Greens"),
    ):
        panel_cmap = plt.get_cmap(cmap_name).copy()
        if read_span_mask is not None:
            panel_cmap.set_bad(outside_read_color)
        sns.heatmap(
            lp,
            ax=ax,
            cmap=panel_cmap,
            xticklabels=False,
            yticklabels=False,
            robust=robust,
            cbar_ax=cbar_ax,
        )
        _apply_xticks(ax, [str(x) for x in lp.columns], xtick_step)

    if title:
        fig.suptitle(title)

    if save_name is not None:
        fname = os.path.join(save_name)
        try:
            # Save this figure explicitly rather than whatever pyplot considers current.
            fig.savefig(fname, dpi=200, bbox_inches="tight")
        finally:
            # Release the figure so repeated calls (e.g. per sample) do not pile up.
            plt.close(fig)
        logger.info("Saved rolling NN/layer pair plot to %s.", fname)
    else:
        plt.show()

    return ordered_index
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def plot_zero_hamming_span_and_layer(
|
|
582
|
+
subset,
|
|
583
|
+
span_layer_key: str,
|
|
584
|
+
layer_key: str = "nan0_0minus1",
|
|
585
|
+
meta_cols: tuple[str, ...] = ("Reference_strand", "Sample"),
|
|
586
|
+
col_cluster: bool = False,
|
|
587
|
+
fill_span_value: float = 0.0,
|
|
588
|
+
fill_layer_value: float = 0.0,
|
|
589
|
+
drop_all_nan_positions: bool = True,
|
|
590
|
+
max_nan_fraction: float | None = None,
|
|
591
|
+
var_valid_fraction_col: str | None = None,
|
|
592
|
+
var_nan_fraction_col: str | None = None,
|
|
593
|
+
read_span_layer: str | None = "read_span_mask",
|
|
594
|
+
outside_read_color: str = "#bdbdbd",
|
|
595
|
+
span_color: str = "#2ca25f",
|
|
596
|
+
figsize: tuple[float, float] = (14, 10),
|
|
597
|
+
robust: bool = True,
|
|
598
|
+
title: str | None = None,
|
|
599
|
+
xtick_step: int | None = None,
|
|
600
|
+
xtick_rotation: int = 90,
|
|
601
|
+
xtick_fontsize: int = 8,
|
|
602
|
+
variant_call_data: "pd.DataFrame | None" = None,
|
|
603
|
+
seq1_label: str = "seq1",
|
|
604
|
+
seq2_label: str = "seq2",
|
|
605
|
+
ref1_marker_color: str = "white",
|
|
606
|
+
ref2_marker_color: str = "black",
|
|
607
|
+
variant_marker_size: float = 4.0,
|
|
608
|
+
save_name: str | None = None,
|
|
609
|
+
):
|
|
610
|
+
"""
|
|
611
|
+
Plot zero-Hamming span clustermap alongside a layer clustermap.
|
|
612
|
+
|
|
613
|
+
Args:
|
|
614
|
+
subset: AnnData subset with zero-Hamming span annotations stored in ``layers``.
|
|
615
|
+
span_layer_key: Layer name with the binary zero-Hamming span mask.
|
|
616
|
+
layer_key: Layer name to plot alongside the span mask.
|
|
617
|
+
meta_cols: Obs columns used for row color annotations.
|
|
618
|
+
col_cluster: Whether to cluster columns in the span mask clustermap.
|
|
619
|
+
fill_span_value: Value to fill NaNs in the span mask.
|
|
620
|
+
fill_layer_value: Value to fill NaNs in the layer heatmap.
|
|
621
|
+
drop_all_nan_positions: Drop positions that are all NaN in the span mask.
|
|
622
|
+
max_nan_fraction: Maximum allowed NaN fraction per position (filtering columns).
|
|
623
|
+
var_valid_fraction_col: ``subset.var`` column with valid fractions (1 - NaN fraction).
|
|
624
|
+
var_nan_fraction_col: ``subset.var`` column with NaN fractions.
|
|
625
|
+
read_span_layer: Layer name with read span mask; 0 values are treated as outside read.
|
|
626
|
+
outside_read_color: Color used to show positions outside each read.
|
|
627
|
+
span_color: Color for zero-Hamming span mask values.
|
|
628
|
+
figsize: Figure size for the combined plot.
|
|
629
|
+
robust: Use robust color scaling in seaborn.
|
|
630
|
+
title: Optional figure title (suptitle).
|
|
631
|
+
xtick_step: Spacing between x-axis tick labels.
|
|
632
|
+
xtick_rotation: Rotation for x-axis tick labels.
|
|
633
|
+
xtick_fontsize: Font size for x-axis tick labels.
|
|
634
|
+
variant_call_data: Optional DataFrame (obs × full var_names) with variant calls
|
|
635
|
+
(1=seq1, 2=seq2). When provided, circles are overlaid at positions that
|
|
636
|
+
overlap with the plotted columns. Built from the full-width adata before
|
|
637
|
+
column filtering so mismatch sites outside modification sites are mapped.
|
|
638
|
+
seq1_label: Label for seq1 in the legend.
|
|
639
|
+
seq2_label: Label for seq2 in the legend.
|
|
640
|
+
ref1_marker_color: Circle color for seq1 variant calls.
|
|
641
|
+
ref2_marker_color: Circle color for seq2 variant calls.
|
|
642
|
+
variant_marker_size: Size of variant call overlay circles.
|
|
643
|
+
save_name: Optional output path for saving the plot.
|
|
644
|
+
"""
|
|
645
|
+
if max_nan_fraction is not None and not (0 <= max_nan_fraction <= 1):
|
|
646
|
+
raise ValueError("max_nan_fraction must be between 0 and 1.")
|
|
647
|
+
|
|
648
|
+
logger.info(
|
|
649
|
+
"Plotting zero-Hamming span mask '%s' with layer '%s'.",
|
|
650
|
+
span_layer_key,
|
|
651
|
+
layer_key,
|
|
652
|
+
)
|
|
653
|
+
|
|
654
|
+
def _apply_xticks(ax, labels, step):
|
|
655
|
+
if labels is None or len(labels) == 0:
|
|
656
|
+
ax.set_xticks([])
|
|
657
|
+
return
|
|
658
|
+
if step is None or step <= 0:
|
|
659
|
+
step = max(1, len(labels) // 10)
|
|
660
|
+
ticks = np.arange(0, len(labels), step)
|
|
661
|
+
ax.set_xticks(ticks + 0.5)
|
|
662
|
+
ax.set_xticklabels(
|
|
663
|
+
[labels[i] for i in ticks],
|
|
664
|
+
rotation=xtick_rotation,
|
|
665
|
+
fontsize=xtick_fontsize,
|
|
666
|
+
)
|
|
667
|
+
|
|
668
|
+
span = subset.layers[span_layer_key]
|
|
669
|
+
span = span.toarray() if hasattr(span, "toarray") else np.asarray(span)
|
|
670
|
+
span_df = pd.DataFrame(span, index=subset.obs_names, columns=subset.var_names)
|
|
671
|
+
span_df.index = span_df.index.astype(str)
|
|
672
|
+
|
|
673
|
+
if drop_all_nan_positions:
|
|
674
|
+
span_df = span_df.loc[:, ~span_df.isna().all(axis=0)]
|
|
675
|
+
|
|
676
|
+
nan_mask = None
|
|
677
|
+
if max_nan_fraction is not None:
|
|
678
|
+
nan_fraction = None
|
|
679
|
+
if var_nan_fraction_col and var_nan_fraction_col in subset.var:
|
|
680
|
+
nan_fraction = pd.to_numeric(
|
|
681
|
+
subset.var[var_nan_fraction_col], errors="coerce"
|
|
682
|
+
).to_numpy()
|
|
683
|
+
elif var_valid_fraction_col and var_valid_fraction_col in subset.var:
|
|
684
|
+
valid_fraction = pd.to_numeric(
|
|
685
|
+
subset.var[var_valid_fraction_col], errors="coerce"
|
|
686
|
+
).to_numpy()
|
|
687
|
+
nan_fraction = 1 - valid_fraction
|
|
688
|
+
if nan_fraction is not None:
|
|
689
|
+
nan_mask = nan_fraction <= max_nan_fraction
|
|
690
|
+
span_df = span_df.loc[:, nan_mask]
|
|
691
|
+
|
|
692
|
+
span_df_filled = span_df.fillna(fill_span_value)
|
|
693
|
+
span_df_filled.index = span_df_filled.index.astype(str)
|
|
694
|
+
|
|
695
|
+
meta = subset.obs.loc[span_df.index, list(meta_cols)].copy()
|
|
696
|
+
meta.index = meta.index.astype(str)
|
|
697
|
+
row_colors = make_row_colors(meta)
|
|
698
|
+
|
|
699
|
+
span_cmap = colors.ListedColormap(["white", span_color])
|
|
700
|
+
span_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], span_cmap.N)
|
|
701
|
+
|
|
702
|
+
g = sns.clustermap(
|
|
703
|
+
span_df_filled,
|
|
704
|
+
cmap=span_cmap,
|
|
705
|
+
norm=span_norm,
|
|
706
|
+
col_cluster=col_cluster,
|
|
707
|
+
row_cluster=True,
|
|
708
|
+
row_colors=row_colors,
|
|
709
|
+
xticklabels=False,
|
|
710
|
+
yticklabels=False,
|
|
711
|
+
robust=robust,
|
|
712
|
+
)
|
|
713
|
+
row_order = g.dendrogram_row.reordered_ind
|
|
714
|
+
ordered_index = span_df_filled.index[row_order]
|
|
715
|
+
plt.close(g.fig)
|
|
716
|
+
|
|
717
|
+
span_ord = span_df_filled.loc[ordered_index]
|
|
718
|
+
|
|
719
|
+
layer = subset.layers[layer_key]
|
|
720
|
+
layer = layer.toarray() if hasattr(layer, "toarray") else np.asarray(layer)
|
|
721
|
+
layer_df = pd.DataFrame(layer, index=subset.obs_names, columns=subset.var_names)
|
|
722
|
+
layer_df.index = layer_df.index.astype(str)
|
|
723
|
+
|
|
724
|
+
if max_nan_fraction is not None and nan_mask is not None:
|
|
725
|
+
layer_df = layer_df.loc[:, nan_mask]
|
|
726
|
+
|
|
727
|
+
read_span_mask = None
|
|
728
|
+
if read_span_layer and read_span_layer in subset.layers:
|
|
729
|
+
span_mask = subset.layers[read_span_layer]
|
|
730
|
+
span_mask = span_mask.toarray() if hasattr(span_mask, "toarray") else np.asarray(span_mask)
|
|
731
|
+
span_mask_df = pd.DataFrame(span_mask, index=subset.obs_names, columns=subset.var_names)
|
|
732
|
+
span_mask_df.index = span_mask_df.index.astype(str)
|
|
733
|
+
if max_nan_fraction is not None and nan_mask is not None:
|
|
734
|
+
span_mask_df = span_mask_df.loc[:, nan_mask]
|
|
735
|
+
read_span_mask = span_mask_df.loc[ordered_index].to_numpy() == 0
|
|
736
|
+
|
|
737
|
+
layer_ord = layer_df.loc[ordered_index]
|
|
738
|
+
layer_plot = layer_ord.fillna(fill_layer_value)
|
|
739
|
+
if read_span_mask is not None:
|
|
740
|
+
layer_plot = layer_plot.mask(read_span_mask)
|
|
741
|
+
|
|
742
|
+
# Apply read span mask to span layer for barplot and heatmap
|
|
743
|
+
span_plot = span_ord.copy()
|
|
744
|
+
if read_span_mask is not None:
|
|
745
|
+
span_plot = span_plot.mask(read_span_mask)
|
|
746
|
+
|
|
747
|
+
fig = plt.figure(figsize=figsize)
|
|
748
|
+
gs = fig.add_gridspec(
|
|
749
|
+
2,
|
|
750
|
+
4,
|
|
751
|
+
width_ratios=[1, 0.05, 1, 0.05],
|
|
752
|
+
height_ratios=[1, 6],
|
|
753
|
+
wspace=0.2,
|
|
754
|
+
hspace=0.05,
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
ax1 = fig.add_subplot(gs[1, 0])
|
|
758
|
+
ax1_cbar = fig.add_subplot(gs[1, 1])
|
|
759
|
+
ax2 = fig.add_subplot(gs[1, 2])
|
|
760
|
+
ax2_cbar = fig.add_subplot(gs[1, 3])
|
|
761
|
+
ax1_bar = fig.add_subplot(gs[0, 0], sharex=ax1)
|
|
762
|
+
ax2_bar = fig.add_subplot(gs[0, 2], sharex=ax2)
|
|
763
|
+
fig.add_subplot(gs[0, 1]).axis("off")
|
|
764
|
+
fig.add_subplot(gs[0, 3]).axis("off")
|
|
765
|
+
|
|
766
|
+
mean_span = np.nanmean(span_plot.to_numpy(), axis=0)
|
|
767
|
+
clean_barplot(
|
|
768
|
+
ax1_bar,
|
|
769
|
+
mean_span,
|
|
770
|
+
span_layer_key,
|
|
771
|
+
y_max=1.0,
|
|
772
|
+
y_label="Span fraction",
|
|
773
|
+
y_ticks=[0.0, 0.5, 1.0],
|
|
774
|
+
)
|
|
775
|
+
|
|
776
|
+
methylation_fraction = _methylation_fraction_for_layer(layer_ord.to_numpy(), layer_key)
|
|
777
|
+
clean_barplot(
|
|
778
|
+
ax2_bar,
|
|
779
|
+
methylation_fraction,
|
|
780
|
+
layer_key,
|
|
781
|
+
y_max=1.0,
|
|
782
|
+
y_label="Methylation fraction",
|
|
783
|
+
y_ticks=[0.0, 0.5, 1.0],
|
|
784
|
+
)
|
|
785
|
+
|
|
786
|
+
span_cmap.set_bad(outside_read_color)
|
|
787
|
+
sns.heatmap(
|
|
788
|
+
span_plot,
|
|
789
|
+
ax=ax1,
|
|
790
|
+
cmap=span_cmap,
|
|
791
|
+
norm=span_norm,
|
|
792
|
+
xticklabels=False,
|
|
793
|
+
yticklabels=False,
|
|
794
|
+
robust=robust,
|
|
795
|
+
cbar_ax=ax1_cbar,
|
|
796
|
+
)
|
|
797
|
+
|
|
798
|
+
layer_cmap = plt.get_cmap("coolwarm").copy()
|
|
799
|
+
if read_span_mask is not None:
|
|
800
|
+
layer_cmap.set_bad(outside_read_color)
|
|
801
|
+
|
|
802
|
+
sns.heatmap(
|
|
803
|
+
layer_plot,
|
|
804
|
+
ax=ax2,
|
|
805
|
+
cmap=layer_cmap,
|
|
806
|
+
xticklabels=False,
|
|
807
|
+
yticklabels=False,
|
|
808
|
+
robust=robust,
|
|
809
|
+
cbar_ax=ax2_cbar,
|
|
810
|
+
)
|
|
811
|
+
|
|
812
|
+
# Overlay variant call circles on both heatmaps if data is available
|
|
813
|
+
if variant_call_data is not None:
|
|
814
|
+
plotted_cols = span_ord.columns
|
|
815
|
+
# Convert plotted column names to numeric positions for nearest-neighbour mapping
|
|
816
|
+
try:
|
|
817
|
+
plotted_positions = np.array([float(c) for c in plotted_cols])
|
|
818
|
+
except (ValueError, TypeError):
|
|
819
|
+
plotted_positions = None
|
|
820
|
+
|
|
821
|
+
if plotted_positions is not None and len(plotted_positions) > 0:
|
|
822
|
+
# Find variant call columns that have any calls
|
|
823
|
+
call_cols_mask = variant_call_data.isin([1, 2]).any(axis=0)
|
|
824
|
+
call_col_names = variant_call_data.columns[call_cols_mask]
|
|
825
|
+
try:
|
|
826
|
+
call_col_positions = np.array([float(c) for c in call_col_names])
|
|
827
|
+
except (ValueError, TypeError):
|
|
828
|
+
call_col_positions = None
|
|
829
|
+
|
|
830
|
+
if call_col_positions is not None and len(call_col_positions) > 0:
|
|
831
|
+
# Map each variant call position to the nearest plotted heatmap column
|
|
832
|
+
insert_idx = np.searchsorted(plotted_positions, call_col_positions)
|
|
833
|
+
insert_idx = np.clip(insert_idx, 0, len(plotted_positions) - 1)
|
|
834
|
+
# Also check the index to the left in case it's closer
|
|
835
|
+
left_idx = np.clip(insert_idx - 1, 0, len(plotted_positions) - 1)
|
|
836
|
+
dist_right = np.abs(plotted_positions[insert_idx] - call_col_positions)
|
|
837
|
+
dist_left = np.abs(plotted_positions[left_idx] - call_col_positions)
|
|
838
|
+
nearest_heatmap_col = np.where(dist_left < dist_right, left_idx, insert_idx)
|
|
839
|
+
|
|
840
|
+
call_sub = variant_call_data.loc[:, call_col_names]
|
|
841
|
+
call_sub.index = call_sub.index.astype(str)
|
|
842
|
+
common_rows = [r for r in ordered_index if r in call_sub.index]
|
|
843
|
+
if common_rows:
|
|
844
|
+
call_ord = call_sub.loc[common_rows].to_numpy()
|
|
845
|
+
row_index_map = {r: i for i, r in enumerate(ordered_index)}
|
|
846
|
+
heatmap_row_indices = np.array([row_index_map[r] for r in common_rows])
|
|
847
|
+
|
|
848
|
+
for call_val, marker_color, label in [
|
|
849
|
+
(1, ref1_marker_color, f"{seq1_label} call"),
|
|
850
|
+
(2, ref2_marker_color, f"{seq2_label} call"),
|
|
851
|
+
]:
|
|
852
|
+
local_rows, local_cols = np.where(call_ord == call_val)
|
|
853
|
+
if len(local_rows) == 0:
|
|
854
|
+
continue
|
|
855
|
+
plot_y = heatmap_row_indices[local_rows]
|
|
856
|
+
plot_x = nearest_heatmap_col[local_cols]
|
|
857
|
+
for ax in (ax1, ax2):
|
|
858
|
+
ax.scatter(
|
|
859
|
+
plot_x + 0.5,
|
|
860
|
+
plot_y + 0.5,
|
|
861
|
+
c=marker_color,
|
|
862
|
+
s=variant_marker_size,
|
|
863
|
+
marker="o",
|
|
864
|
+
edgecolors="gray",
|
|
865
|
+
linewidths=0.3,
|
|
866
|
+
zorder=3,
|
|
867
|
+
label=label,
|
|
868
|
+
)
|
|
869
|
+
|
|
870
|
+
_apply_xticks(ax1, [str(x) for x in span_ord.columns], xtick_step)
|
|
871
|
+
_apply_xticks(ax2, [str(x) for x in layer_plot.columns], xtick_step)
|
|
872
|
+
|
|
873
|
+
if title:
|
|
874
|
+
fig.suptitle(title)
|
|
875
|
+
|
|
876
|
+
if save_name is not None:
|
|
877
|
+
fname = os.path.join(save_name)
|
|
878
|
+
plt.savefig(fname, dpi=200, bbox_inches="tight")
|
|
879
|
+
logger.info("Saved zero-Hamming span/layer plot to %s.", fname)
|
|
880
|
+
else:
|
|
881
|
+
plt.show()
|
|
882
|
+
|
|
883
|
+
return ordered_index
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def plot_delta_hamming_summary(
|
|
887
|
+
subset,
|
|
888
|
+
self_obsm_key: str = "rolling_nn_dist",
|
|
889
|
+
cross_obsm_key: str = "rolling_nn_dist",
|
|
890
|
+
layer_key: str = "nan0_0minus1",
|
|
891
|
+
self_span_layer_key: str = "zero_hamming_distance_spans",
|
|
892
|
+
cross_span_layer_key: str = "cross_sample_zero_hamming_distance_spans",
|
|
893
|
+
delta_span_layer_key: str = "delta_zero_hamming_distance_spans",
|
|
894
|
+
meta_cols: tuple[str, ...] = ("Reference_strand", "Sample"),
|
|
895
|
+
col_cluster: bool = False,
|
|
896
|
+
fill_nn_with_colmax: bool = True,
|
|
897
|
+
fill_layer_value: float = 0.0,
|
|
898
|
+
fill_span_value: float = 0.0,
|
|
899
|
+
drop_all_nan_windows: bool = True,
|
|
900
|
+
max_nan_fraction: float | None = None,
|
|
901
|
+
var_valid_fraction_col: str | None = None,
|
|
902
|
+
var_nan_fraction_col: str | None = None,
|
|
903
|
+
read_span_layer: str | None = "read_span_mask",
|
|
904
|
+
outside_read_color: str = "#bdbdbd",
|
|
905
|
+
nn_nan_color: str = "#bdbdbd",
|
|
906
|
+
span_color: str = "#2ca25f",
|
|
907
|
+
cross_span_color: str = "#e6550d",
|
|
908
|
+
delta_span_color: str = "#756bb1",
|
|
909
|
+
figsize: tuple[float, float] = (30, 24),
|
|
910
|
+
robust: bool = True,
|
|
911
|
+
title: str | None = None,
|
|
912
|
+
xtick_step: int | None = None,
|
|
913
|
+
xtick_rotation: int = 90,
|
|
914
|
+
xtick_fontsize: int = 8,
|
|
915
|
+
save_name: str | None = None,
|
|
916
|
+
):
|
|
917
|
+
"""
|
|
918
|
+
Plot a 2×3 summary: row 1 = self NN, cross NN, signal layer;
|
|
919
|
+
row 2 = self hamming spans, cross hamming spans, delta hamming spans.
|
|
920
|
+
|
|
921
|
+
Cluster order is determined by the delta hamming span layer.
|
|
922
|
+
Barplots are drawn above each clustermap.
|
|
923
|
+
|
|
924
|
+
Args:
|
|
925
|
+
subset: AnnData subset with all required obsm/layers.
|
|
926
|
+
self_obsm_key: obsm key for within-sample rolling NN distances.
|
|
927
|
+
cross_obsm_key: obsm key for cross-sample rolling NN distances.
|
|
928
|
+
layer_key: Signal layer to plot in top-right panel.
|
|
929
|
+
self_span_layer_key: Layer with within-sample zero-Hamming spans.
|
|
930
|
+
cross_span_layer_key: Layer with cross-sample zero-Hamming spans.
|
|
931
|
+
delta_span_layer_key: Layer with delta (self - cross) zero-Hamming spans.
|
|
932
|
+
meta_cols: Obs columns for row color annotations.
|
|
933
|
+
col_cluster: Cluster columns.
|
|
934
|
+
fill_nn_with_colmax: Fill NN NaNs with per-column max for display.
|
|
935
|
+
fill_layer_value: Fill NaN in signal layer.
|
|
936
|
+
fill_span_value: Fill NaN in span layers.
|
|
937
|
+
drop_all_nan_windows: Drop all-NaN rolling NN windows.
|
|
938
|
+
max_nan_fraction: Max NaN fraction filter for layer columns.
|
|
939
|
+
var_valid_fraction_col: Var column with valid fraction.
|
|
940
|
+
var_nan_fraction_col: Var column with NaN fraction.
|
|
941
|
+
read_span_layer: Layer with read span mask.
|
|
942
|
+
outside_read_color: Color for outside-read positions.
|
|
943
|
+
nn_nan_color: Color for NaN in NN heatmaps.
|
|
944
|
+
span_color: Color for self hamming span (1 values).
|
|
945
|
+
cross_span_color: Color for cross hamming span (1 values).
|
|
946
|
+
delta_span_color: Color for delta hamming span (1 values).
|
|
947
|
+
figsize: Figure size.
|
|
948
|
+
robust: Robust color scaling.
|
|
949
|
+
title: Figure suptitle.
|
|
950
|
+
xtick_step: Spacing between x-tick labels.
|
|
951
|
+
xtick_rotation: X-tick label rotation.
|
|
952
|
+
xtick_fontsize: X-tick label font size.
|
|
953
|
+
save_name: Output path.
|
|
954
|
+
"""
|
|
955
|
+
logger.info(
|
|
956
|
+
"Plotting delta hamming summary: self_nn=%s cross_nn=%s delta_span=%s.",
|
|
957
|
+
self_obsm_key,
|
|
958
|
+
cross_obsm_key,
|
|
959
|
+
delta_span_layer_key,
|
|
960
|
+
)
|
|
961
|
+
|
|
962
|
+
def _apply_xticks(ax, labels, step):
|
|
963
|
+
if labels is None or len(labels) == 0:
|
|
964
|
+
ax.set_xticks([])
|
|
965
|
+
return
|
|
966
|
+
if step is None or step <= 0:
|
|
967
|
+
step = max(1, len(labels) // 10)
|
|
968
|
+
ticks = np.arange(0, len(labels), step)
|
|
969
|
+
ax.set_xticks(ticks + 0.5)
|
|
970
|
+
ax.set_xticklabels(
|
|
971
|
+
[labels[i] for i in ticks],
|
|
972
|
+
rotation=xtick_rotation,
|
|
973
|
+
fontsize=xtick_fontsize,
|
|
974
|
+
)
|
|
975
|
+
|
|
976
|
+
def _format_labels(values):
|
|
977
|
+
values = np.asarray(values)
|
|
978
|
+
if np.issubdtype(values.dtype, np.number):
|
|
979
|
+
if np.all(np.isfinite(values)) and np.all(np.isclose(values, np.round(values))):
|
|
980
|
+
values = np.round(values).astype(int)
|
|
981
|
+
return [str(v) for v in values]
|
|
982
|
+
|
|
983
|
+
# --- Determine row order from delta span layer ---
|
|
984
|
+
delta_span = subset.layers[delta_span_layer_key]
|
|
985
|
+
delta_span = delta_span.toarray() if hasattr(delta_span, "toarray") else np.asarray(delta_span)
|
|
986
|
+
delta_span_df = pd.DataFrame(delta_span, index=subset.obs_names, columns=subset.var_names)
|
|
987
|
+
delta_span_df.index = delta_span_df.index.astype(str)
|
|
988
|
+
|
|
989
|
+
# NaN fraction filtering for layer columns
|
|
990
|
+
nan_mask = None
|
|
991
|
+
if max_nan_fraction is not None:
|
|
992
|
+
nan_fraction = None
|
|
993
|
+
if var_nan_fraction_col and var_nan_fraction_col in subset.var:
|
|
994
|
+
nan_fraction = pd.to_numeric(
|
|
995
|
+
subset.var[var_nan_fraction_col], errors="coerce"
|
|
996
|
+
).to_numpy()
|
|
997
|
+
elif var_valid_fraction_col and var_valid_fraction_col in subset.var:
|
|
998
|
+
valid_fraction = pd.to_numeric(
|
|
999
|
+
subset.var[var_valid_fraction_col], errors="coerce"
|
|
1000
|
+
).to_numpy()
|
|
1001
|
+
nan_fraction = 1 - valid_fraction
|
|
1002
|
+
if nan_fraction is not None:
|
|
1003
|
+
nan_mask = nan_fraction <= max_nan_fraction
|
|
1004
|
+
delta_span_df = delta_span_df.loc[:, nan_mask]
|
|
1005
|
+
|
|
1006
|
+
delta_span_filled = delta_span_df.fillna(fill_span_value)
|
|
1007
|
+
delta_span_filled.index = delta_span_filled.index.astype(str)
|
|
1008
|
+
|
|
1009
|
+
meta = subset.obs.loc[delta_span_df.index, list(meta_cols)].copy()
|
|
1010
|
+
meta.index = meta.index.astype(str)
|
|
1011
|
+
row_colors = make_row_colors(meta)
|
|
1012
|
+
|
|
1013
|
+
delta_cmap = colors.ListedColormap(["white", delta_span_color])
|
|
1014
|
+
delta_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], delta_cmap.N)
|
|
1015
|
+
|
|
1016
|
+
g = sns.clustermap(
|
|
1017
|
+
delta_span_filled,
|
|
1018
|
+
cmap=delta_cmap,
|
|
1019
|
+
norm=delta_norm,
|
|
1020
|
+
col_cluster=col_cluster,
|
|
1021
|
+
row_cluster=True,
|
|
1022
|
+
row_colors=row_colors,
|
|
1023
|
+
xticklabels=False,
|
|
1024
|
+
yticklabels=False,
|
|
1025
|
+
robust=robust,
|
|
1026
|
+
)
|
|
1027
|
+
row_order = g.dendrogram_row.reordered_ind
|
|
1028
|
+
ordered_index = delta_span_filled.index[row_order]
|
|
1029
|
+
plt.close(g.fig)
|
|
1030
|
+
|
|
1031
|
+
# --- Helper to extract + order a span layer ---
|
|
1032
|
+
def _span_df(layer_key):
|
|
1033
|
+
raw = subset.layers[layer_key]
|
|
1034
|
+
raw = raw.toarray() if hasattr(raw, "toarray") else np.asarray(raw)
|
|
1035
|
+
df = pd.DataFrame(raw, index=subset.obs_names, columns=subset.var_names)
|
|
1036
|
+
df.index = df.index.astype(str)
|
|
1037
|
+
if nan_mask is not None:
|
|
1038
|
+
df = df.loc[:, nan_mask]
|
|
1039
|
+
return df.loc[ordered_index].fillna(fill_span_value)
|
|
1040
|
+
|
|
1041
|
+
self_span_ord = _span_df(self_span_layer_key)
|
|
1042
|
+
cross_span_ord = _span_df(cross_span_layer_key)
|
|
1043
|
+
delta_span_ord = delta_span_filled.loc[ordered_index]
|
|
1044
|
+
|
|
1045
|
+
# --- Read span mask for layer-resolution panels ---
|
|
1046
|
+
read_span_outside = None
|
|
1047
|
+
if read_span_layer and read_span_layer in subset.layers:
|
|
1048
|
+
rsm = subset.layers[read_span_layer]
|
|
1049
|
+
rsm = rsm.toarray() if hasattr(rsm, "toarray") else np.asarray(rsm)
|
|
1050
|
+
rsm_df = pd.DataFrame(rsm, index=subset.obs_names, columns=subset.var_names)
|
|
1051
|
+
rsm_df.index = rsm_df.index.astype(str)
|
|
1052
|
+
if nan_mask is not None:
|
|
1053
|
+
rsm_df = rsm_df.loc[:, nan_mask]
|
|
1054
|
+
read_span_outside = rsm_df.loc[ordered_index].to_numpy() == 0
|
|
1055
|
+
|
|
1056
|
+
# Apply read span mask to span layers (NaN outside read → grey in heatmap, excluded from barplot)
|
|
1057
|
+
self_span_plot = self_span_ord.copy()
|
|
1058
|
+
cross_span_plot = cross_span_ord.copy()
|
|
1059
|
+
delta_span_plot = delta_span_ord.copy()
|
|
1060
|
+
if read_span_outside is not None:
|
|
1061
|
+
self_span_plot = self_span_plot.mask(read_span_outside)
|
|
1062
|
+
cross_span_plot = cross_span_plot.mask(read_span_outside)
|
|
1063
|
+
delta_span_plot = delta_span_plot.mask(read_span_outside)
|
|
1064
|
+
|
|
1065
|
+
# --- NN data ---
|
|
1066
|
+
def _nn_df(obsm_key):
|
|
1067
|
+
X = subset.obsm[obsm_key]
|
|
1068
|
+
valid = ~np.all(np.isnan(X), axis=1)
|
|
1069
|
+
df = pd.DataFrame(X, index=subset.obs_names)
|
|
1070
|
+
df.index = df.index.astype(str)
|
|
1071
|
+
if drop_all_nan_windows:
|
|
1072
|
+
df = df.loc[:, ~df.isna().all(axis=0)]
|
|
1073
|
+
col_max = df.max(axis=0, skipna=True).fillna(0)
|
|
1074
|
+
df_cluster = df.fillna(col_max)
|
|
1075
|
+
if fill_nn_with_colmax:
|
|
1076
|
+
df_display = df_cluster
|
|
1077
|
+
else:
|
|
1078
|
+
df_display = df.copy()
|
|
1079
|
+
return df_display.loc[ordered_index]
|
|
1080
|
+
|
|
1081
|
+
self_nn_ord = _nn_df(self_obsm_key)
|
|
1082
|
+
cross_nn_ord = _nn_df(cross_obsm_key)
|
|
1083
|
+
|
|
1084
|
+
# --- Signal layer ---
|
|
1085
|
+
layer_raw = subset.layers[layer_key]
|
|
1086
|
+
layer_raw = layer_raw.toarray() if hasattr(layer_raw, "toarray") else np.asarray(layer_raw)
|
|
1087
|
+
layer_df = pd.DataFrame(layer_raw, index=subset.obs_names, columns=subset.var_names)
|
|
1088
|
+
layer_df.index = layer_df.index.astype(str)
|
|
1089
|
+
if nan_mask is not None:
|
|
1090
|
+
layer_df = layer_df.loc[:, nan_mask]
|
|
1091
|
+
|
|
1092
|
+
layer_ord = layer_df.loc[ordered_index]
|
|
1093
|
+
layer_plot = layer_ord.fillna(fill_layer_value)
|
|
1094
|
+
if read_span_outside is not None:
|
|
1095
|
+
layer_plot = layer_plot.mask(read_span_outside)
|
|
1096
|
+
|
|
1097
|
+
# --- Figure layout: 5 rows × 6 cols ---
|
|
1098
|
+
# row 0: barplots for top row
|
|
1099
|
+
# row 1: heatmaps (self NN, cross NN, signal)
|
|
1100
|
+
# row 2: spacer
|
|
1101
|
+
# row 3: barplots for bottom row
|
|
1102
|
+
# row 4: heatmaps (self span, cross span, delta span)
|
|
1103
|
+
fig = plt.figure(figsize=figsize)
|
|
1104
|
+
gs = fig.add_gridspec(
|
|
1105
|
+
5,
|
|
1106
|
+
6,
|
|
1107
|
+
width_ratios=[1, 0.05, 1, 0.05, 1, 0.05],
|
|
1108
|
+
height_ratios=[1, 8, 0.8, 1, 8],
|
|
1109
|
+
wspace=0.2,
|
|
1110
|
+
hspace=0.05,
|
|
1111
|
+
)
|
|
1112
|
+
|
|
1113
|
+
# Row 1 heatmaps + colorbars
|
|
1114
|
+
ax_self_nn = fig.add_subplot(gs[1, 0])
|
|
1115
|
+
ax_self_nn_cbar = fig.add_subplot(gs[1, 1])
|
|
1116
|
+
ax_cross_nn = fig.add_subplot(gs[1, 2])
|
|
1117
|
+
ax_cross_nn_cbar = fig.add_subplot(gs[1, 3])
|
|
1118
|
+
ax_signal = fig.add_subplot(gs[1, 4])
|
|
1119
|
+
ax_signal_cbar = fig.add_subplot(gs[1, 5])
|
|
1120
|
+
|
|
1121
|
+
# Row 1 barplots
|
|
1122
|
+
ax_self_nn_bar = fig.add_subplot(gs[0, 0], sharex=ax_self_nn)
|
|
1123
|
+
ax_cross_nn_bar = fig.add_subplot(gs[0, 2], sharex=ax_cross_nn)
|
|
1124
|
+
ax_signal_bar = fig.add_subplot(gs[0, 4], sharex=ax_signal)
|
|
1125
|
+
fig.add_subplot(gs[0, 1]).axis("off")
|
|
1126
|
+
fig.add_subplot(gs[0, 3]).axis("off")
|
|
1127
|
+
fig.add_subplot(gs[0, 5]).axis("off")
|
|
1128
|
+
|
|
1129
|
+
# Spacer row
|
|
1130
|
+
for col in range(6):
|
|
1131
|
+
fig.add_subplot(gs[2, col]).axis("off")
|
|
1132
|
+
|
|
1133
|
+
# Row 2 heatmaps + colorbars
|
|
1134
|
+
ax_self_span = fig.add_subplot(gs[4, 0])
|
|
1135
|
+
ax_self_span_cbar = fig.add_subplot(gs[4, 1])
|
|
1136
|
+
ax_cross_span = fig.add_subplot(gs[4, 2])
|
|
1137
|
+
ax_cross_span_cbar = fig.add_subplot(gs[4, 3])
|
|
1138
|
+
ax_delta_span = fig.add_subplot(gs[4, 4])
|
|
1139
|
+
ax_delta_span_cbar = fig.add_subplot(gs[4, 5])
|
|
1140
|
+
|
|
1141
|
+
# Row 2 barplots
|
|
1142
|
+
ax_self_span_bar = fig.add_subplot(gs[3, 0], sharex=ax_self_span)
|
|
1143
|
+
ax_cross_span_bar = fig.add_subplot(gs[3, 2], sharex=ax_cross_span)
|
|
1144
|
+
ax_delta_span_bar = fig.add_subplot(gs[3, 4], sharex=ax_delta_span)
|
|
1145
|
+
fig.add_subplot(gs[3, 1]).axis("off")
|
|
1146
|
+
fig.add_subplot(gs[3, 3]).axis("off")
|
|
1147
|
+
fig.add_subplot(gs[3, 5]).axis("off")
|
|
1148
|
+
|
|
1149
|
+
# --- Row 1: NN + signal barplots ---
|
|
1150
|
+
mean_self_nn = np.nanmean(self_nn_ord.to_numpy(), axis=0)
|
|
1151
|
+
mean_cross_nn = np.nanmean(cross_nn_ord.to_numpy(), axis=0)
|
|
1152
|
+
nn_y_max = float(np.nanmax(np.concatenate([mean_self_nn, mean_cross_nn])))
|
|
1153
|
+
if not np.isfinite(nn_y_max) or nn_y_max <= 0:
|
|
1154
|
+
nn_y_max = None
|
|
1155
|
+
clean_barplot(
|
|
1156
|
+
ax_self_nn_bar,
|
|
1157
|
+
mean_self_nn,
|
|
1158
|
+
"Self NN",
|
|
1159
|
+
y_max=nn_y_max,
|
|
1160
|
+
y_label="Mean distance",
|
|
1161
|
+
y_ticks=None,
|
|
1162
|
+
)
|
|
1163
|
+
clean_barplot(
|
|
1164
|
+
ax_cross_nn_bar,
|
|
1165
|
+
mean_cross_nn,
|
|
1166
|
+
"Cross NN",
|
|
1167
|
+
y_max=nn_y_max,
|
|
1168
|
+
y_label="Mean distance",
|
|
1169
|
+
y_ticks=None,
|
|
1170
|
+
)
|
|
1171
|
+
methylation_fraction = _methylation_fraction_for_layer(layer_ord.to_numpy(), layer_key)
|
|
1172
|
+
clean_barplot(
|
|
1173
|
+
ax_signal_bar,
|
|
1174
|
+
methylation_fraction,
|
|
1175
|
+
layer_key,
|
|
1176
|
+
y_max=1.0,
|
|
1177
|
+
y_label="Methylation fraction",
|
|
1178
|
+
y_ticks=[0.0, 0.5, 1.0],
|
|
1179
|
+
)
|
|
1180
|
+
|
|
1181
|
+
# --- Row 1: NN + signal heatmaps ---
|
|
1182
|
+
nn_cmap = plt.get_cmap("viridis").copy()
|
|
1183
|
+
nn_cmap.set_bad(nn_nan_color)
|
|
1184
|
+
|
|
1185
|
+
sns.heatmap(
|
|
1186
|
+
self_nn_ord,
|
|
1187
|
+
ax=ax_self_nn,
|
|
1188
|
+
cmap=nn_cmap,
|
|
1189
|
+
xticklabels=False,
|
|
1190
|
+
yticklabels=False,
|
|
1191
|
+
robust=robust,
|
|
1192
|
+
cbar_ax=ax_self_nn_cbar,
|
|
1193
|
+
)
|
|
1194
|
+
sns.heatmap(
|
|
1195
|
+
cross_nn_ord,
|
|
1196
|
+
ax=ax_cross_nn,
|
|
1197
|
+
cmap=nn_cmap,
|
|
1198
|
+
xticklabels=False,
|
|
1199
|
+
yticklabels=False,
|
|
1200
|
+
robust=robust,
|
|
1201
|
+
cbar_ax=ax_cross_nn_cbar,
|
|
1202
|
+
)
|
|
1203
|
+
|
|
1204
|
+
layer_cmap = plt.get_cmap("coolwarm").copy()
|
|
1205
|
+
if read_span_outside is not None:
|
|
1206
|
+
layer_cmap.set_bad(outside_read_color)
|
|
1207
|
+
sns.heatmap(
|
|
1208
|
+
layer_plot,
|
|
1209
|
+
ax=ax_signal,
|
|
1210
|
+
cmap=layer_cmap,
|
|
1211
|
+
xticklabels=False,
|
|
1212
|
+
yticklabels=False,
|
|
1213
|
+
robust=robust,
|
|
1214
|
+
cbar_ax=ax_signal_cbar,
|
|
1215
|
+
)
|
|
1216
|
+
|
|
1217
|
+
# NN x-tick labels
|
|
1218
|
+
for ax_nn, obsm_key in ((ax_self_nn, self_obsm_key), (ax_cross_nn, cross_obsm_key)):
|
|
1219
|
+
label_source = subset.uns.get(f"{obsm_key}_centers")
|
|
1220
|
+
if label_source is None:
|
|
1221
|
+
label_source = subset.uns.get(f"{obsm_key}_starts")
|
|
1222
|
+
if label_source is not None:
|
|
1223
|
+
_apply_xticks(ax_nn, _format_labels(np.asarray(label_source)), xtick_step)
|
|
1224
|
+
|
|
1225
|
+
_apply_xticks(ax_signal, [str(x) for x in layer_plot.columns], xtick_step)
|
|
1226
|
+
|
|
1227
|
+
# --- Row 2: span barplots (matched y-scale, using read-span-masked data) ---
|
|
1228
|
+
mean_self_span = np.nanmean(self_span_plot.to_numpy(), axis=0)
|
|
1229
|
+
mean_cross_span = np.nanmean(cross_span_plot.to_numpy(), axis=0)
|
|
1230
|
+
mean_delta_span = np.nanmean(delta_span_plot.to_numpy(), axis=0)
|
|
1231
|
+
span_y_max = float(
|
|
1232
|
+
np.nanmax(np.concatenate([mean_self_span, mean_cross_span, mean_delta_span]))
|
|
1233
|
+
)
|
|
1234
|
+
if not np.isfinite(span_y_max) or span_y_max <= 0:
|
|
1235
|
+
span_y_max = 1.0
|
|
1236
|
+
# Round up to nearest 0.1 for clean ticks
|
|
1237
|
+
span_y_max = np.ceil(span_y_max * 10) / 10
|
|
1238
|
+
span_y_ticks = [0.0, span_y_max / 2, span_y_max]
|
|
1239
|
+
clean_barplot(
|
|
1240
|
+
ax_self_span_bar,
|
|
1241
|
+
mean_self_span,
|
|
1242
|
+
"Self spans",
|
|
1243
|
+
y_max=span_y_max,
|
|
1244
|
+
y_label="Span fraction",
|
|
1245
|
+
y_ticks=span_y_ticks,
|
|
1246
|
+
)
|
|
1247
|
+
clean_barplot(
|
|
1248
|
+
ax_cross_span_bar,
|
|
1249
|
+
mean_cross_span,
|
|
1250
|
+
"Cross spans",
|
|
1251
|
+
y_max=span_y_max,
|
|
1252
|
+
y_label="Span fraction",
|
|
1253
|
+
y_ticks=span_y_ticks,
|
|
1254
|
+
)
|
|
1255
|
+
clean_barplot(
|
|
1256
|
+
ax_delta_span_bar,
|
|
1257
|
+
mean_delta_span,
|
|
1258
|
+
"Delta spans",
|
|
1259
|
+
y_max=span_y_max,
|
|
1260
|
+
y_label="Span fraction",
|
|
1261
|
+
y_ticks=span_y_ticks,
|
|
1262
|
+
)
|
|
1263
|
+
|
|
1264
|
+
# --- Row 2: span heatmaps (read-span-masked, outside-read = grey) ---
|
|
1265
|
+
self_span_cmap = colors.ListedColormap(["white", span_color])
|
|
1266
|
+
self_span_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], self_span_cmap.N)
|
|
1267
|
+
self_span_cmap.set_bad(outside_read_color)
|
|
1268
|
+
cross_span_cmap = colors.ListedColormap(["white", cross_span_color])
|
|
1269
|
+
cross_span_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], cross_span_cmap.N)
|
|
1270
|
+
cross_span_cmap.set_bad(outside_read_color)
|
|
1271
|
+
delta_cmap.set_bad(outside_read_color)
|
|
1272
|
+
|
|
1273
|
+
sns.heatmap(
|
|
1274
|
+
self_span_plot,
|
|
1275
|
+
ax=ax_self_span,
|
|
1276
|
+
cmap=self_span_cmap,
|
|
1277
|
+
norm=self_span_norm,
|
|
1278
|
+
xticklabels=False,
|
|
1279
|
+
yticklabels=False,
|
|
1280
|
+
robust=robust,
|
|
1281
|
+
cbar_ax=ax_self_span_cbar,
|
|
1282
|
+
)
|
|
1283
|
+
sns.heatmap(
|
|
1284
|
+
cross_span_plot,
|
|
1285
|
+
ax=ax_cross_span,
|
|
1286
|
+
cmap=cross_span_cmap,
|
|
1287
|
+
norm=cross_span_norm,
|
|
1288
|
+
xticklabels=False,
|
|
1289
|
+
yticklabels=False,
|
|
1290
|
+
robust=robust,
|
|
1291
|
+
cbar_ax=ax_cross_span_cbar,
|
|
1292
|
+
)
|
|
1293
|
+
sns.heatmap(
|
|
1294
|
+
delta_span_plot,
|
|
1295
|
+
ax=ax_delta_span,
|
|
1296
|
+
cmap=delta_cmap,
|
|
1297
|
+
norm=delta_norm,
|
|
1298
|
+
xticklabels=False,
|
|
1299
|
+
yticklabels=False,
|
|
1300
|
+
robust=robust,
|
|
1301
|
+
cbar_ax=ax_delta_span_cbar,
|
|
1302
|
+
)
|
|
1303
|
+
|
|
1304
|
+
col_labels = [str(x) for x in self_span_ord.columns]
|
|
1305
|
+
for ax in (ax_self_span, ax_cross_span, ax_delta_span):
|
|
1306
|
+
_apply_xticks(ax, col_labels, xtick_step)
|
|
1307
|
+
|
|
1308
|
+
if title:
|
|
1309
|
+
fig.suptitle(title)
|
|
1310
|
+
|
|
1311
|
+
if save_name is not None:
|
|
1312
|
+
fname = os.path.join(save_name)
|
|
1313
|
+
plt.savefig(fname, dpi=200, bbox_inches="tight")
|
|
1314
|
+
logger.info("Saved delta hamming summary plot to %s.", fname)
|
|
1315
|
+
else:
|
|
1316
|
+
plt.show()
|
|
1317
|
+
|
|
1318
|
+
plt.close(fig)
|
|
1319
|
+
return ordered_index
|
|
1320
|
+
|
|
1321
|
+
|
|
1322
|
+
def plot_span_length_distributions(
    subset,
    self_span_layer_key: str = "zero_hamming_distance_spans",
    cross_span_layer_key: str = "cross_sample_zero_hamming_distance_spans",
    delta_span_layer_key: str = "delta_zero_hamming_distance_spans",
    read_span_layer: str | None = "read_span_mask",
    bins: int = 30,
    self_color: str = "#2ca25f",
    cross_color: str = "#e6550d",
    delta_color: str = "#756bb1",
    figsize: tuple[float, float] = (10, 6),
    title: str | None = None,
    save_name: str | None = None,
):
    """
    Overlay probability-density histograms of contiguous span lengths from three layers.

    Span length is measured in base-pair coordinates using ``subset.var_names``
    (last position of a run minus its first position). If the var names cannot
    be parsed as numbers, column indices are used instead. Positions outside the
    valid read span (where ``read_span_layer == 0``) are excluded before
    detecting contiguous runs. Runs covering a single position have length 0
    and are not counted.

    All histograms are drawn on one shared set of bin edges so that bars from
    different layers are directly comparable.

    Args:
        subset: AnnData subset containing the span layers.
        self_span_layer_key: Layer with within-sample zero-Hamming spans.
        cross_span_layer_key: Layer with cross-sample zero-Hamming spans.
        delta_span_layer_key: Layer with delta (self - cross) spans.
        read_span_layer: Layer with read span mask; 0 = outside read. Ignored
            when ``None`` or absent from ``subset.layers``.
        bins: Number of histogram bins (spread over the pooled length range).
        self_color: Histogram color for self spans.
        cross_color: Histogram color for cross spans.
        delta_color: Histogram color for delta spans.
        figsize: Figure size.
        title: Figure title.
        save_name: Output path. When ``None`` the figure is shown instead.

    Returns:
        None. A warning is logged and nothing is drawn when none of the three
        span layers exists in ``subset.layers``.
    """

    def _dense(layer):
        """Return ``layer`` as a dense ndarray (sparse matrices are densified)."""
        return layer.toarray() if hasattr(layer, "toarray") else np.asarray(layer)

    def _extract_span_lengths(layer_arr, positions, read_mask):
        """Extract lengths (in bp) of contiguous in-span runs, row by row."""
        # Binarise explicitly: NaN and non-positive entries count as "no span",
        # so run edges are always exactly +1 / -1 and starts pair with ends.
        in_span = layer_arr > 0
        if read_mask is not None:
            in_span = in_span & read_mask
        # Zero-pad each row on both sides so runs touching an edge still
        # produce both a rising and a falling edge.
        padded = np.pad(in_span.astype(np.int8), ((0, 0), (1, 1)))
        edges = np.diff(padded, axis=1)
        # nonzero() is row-major and every row has as many rising as falling
        # edges, so the k-th start pairs with the k-th end.
        _, run_starts = np.nonzero(edges == 1)
        _, run_ends = np.nonzero(edges == -1)
        lengths = positions[run_ends - 1] - positions[run_starts]
        return lengths[lengths > 0].astype(float)

    # Parse genomic positions from var_names; fall back to column indices.
    try:
        positions = np.array(subset.var_names, dtype=float)
    except (ValueError, TypeError):
        positions = np.arange(subset.n_vars, dtype=float)

    # Read span mask (True = inside the read).
    read_mask = None
    if read_span_layer and read_span_layer in subset.layers:
        read_mask = _dense(subset.layers[read_span_layer]).astype(bool)

    entries = []
    for layer_key, color, label in (
        (self_span_layer_key, self_color, "Self"),
        (cross_span_layer_key, cross_color, "Cross"),
        (delta_span_layer_key, delta_color, "Delta"),
    ):
        if layer_key not in subset.layers:
            continue
        arr = _dense(subset.layers[layer_key])
        entries.append((label, color, _extract_span_lengths(arr, positions, read_mask)))

    if not entries:
        logger.warning("No span layers found for span length distribution plot.")
        return

    # One common set of bin edges for every overlaid histogram.
    populated = [lengths for _, _, lengths in entries if lengths.size]
    hist_bins = np.histogram_bin_edges(np.concatenate(populated), bins=bins) if populated else bins

    fig, ax = plt.subplots(figsize=figsize)

    for label, color, span_lengths in entries:
        if span_lengths.size == 0:
            # Keep a legend entry for layers that exist but yielded no spans.
            ax.axhline(0, color=color, label=f"{label} (n=0)")
            continue
        ax.hist(
            span_lengths,
            bins=hist_bins,
            density=True,
            alpha=0.5,
            color=color,
            label=f"{label} (n={len(span_lengths)})",
            edgecolor="black",
            linewidth=0.5,
        )

    ax.set_xlabel("Span length (bp)")
    ax.set_ylabel("Probability density")
    ax.legend()

    if title:
        ax.set_title(title)

    if save_name is not None:
        fname = os.path.join(save_name)
        # Save through the figure handle rather than pyplot's "current figure".
        fig.savefig(fname, dpi=200, bbox_inches="tight")
        logger.info("Saved span length distribution plot to %s.", fname)
    else:
        plt.show()

    plt.close(fig)
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
def _window_center_labels(var_names: Sequence, starts: np.ndarray, window: int) -> list[str]:
|
|
1441
|
+
coords = np.asarray(var_names)
|
|
1442
|
+
if coords.size == 0:
|
|
1443
|
+
return []
|
|
1444
|
+
try:
|
|
1445
|
+
coords_numeric = coords.astype(float)
|
|
1446
|
+
centers = np.array(
|
|
1447
|
+
[floor(np.nanmean(coords_numeric[s : s + window])) for s in starts], dtype=float
|
|
1448
|
+
)
|
|
1449
|
+
return [str(c) for c in centers]
|
|
1450
|
+
except Exception:
|
|
1451
|
+
mid = np.clip(starts + (window // 2), 0, coords.size - 1)
|
|
1452
|
+
return [str(coords[idx]) for idx in mid]
|
|
1453
|
+
|
|
1454
|
+
|
|
1455
|
+
def plot_zero_hamming_pair_counts(
    subset,
    zero_pairs_uns_key: str,
    meta_cols: tuple[str, ...] = ("Reference_strand", "Sample"),
    col_cluster: bool = False,
    figsize: tuple[float, float] = (14, 10),
    robust: bool = True,
    title: str | None = None,
    xtick_step: int | None = None,
    xtick_rotation: int = 90,
    xtick_fontsize: int = 8,
    save_name: str | None = None,
):
    """
    Plot a heatmap of zero-Hamming pair counts per read across rolling windows.

    For every window, each read's count is the number of zero-distance pairs it
    takes part in (both members of a pair are incremented). Rows are always
    hierarchically clustered; columns are clustered only when ``col_cluster``
    is set, in which case the x tick labels follow the clustered column order.

    Args:
        subset: AnnData subset containing zero-pair window data in ``.uns``.
        zero_pairs_uns_key: Key in ``subset.uns`` with zero-pair window data
            (one ``(n, 2)`` array of read indices per window). The optional
            companions ``<key>_starts`` and ``<key>_window`` supply window
            start columns and width for coordinate labels.
        meta_cols: Obs columns used for row color annotations.
        col_cluster: Whether to cluster columns in the heatmap.
        figsize: Figure size for the plot.
        robust: Use robust color scaling in seaborn.
        title: Optional figure title (suptitle).
        xtick_step: Spacing between x-axis tick labels; ``None`` or a
            non-positive value picks roughly ten labels.
        xtick_rotation: Rotation for x-axis tick labels.
        xtick_fontsize: Font size for x-axis tick labels.
        save_name: Optional output path for saving the plot.

    Returns:
        The seaborn cluster grid returned by ``sns.clustermap``.

    Raises:
        KeyError: If ``zero_pairs_uns_key`` is missing from ``subset.uns``.
        ValueError: If a window's pair entry is not of shape ``(n, 2)``.
    """
    if zero_pairs_uns_key not in subset.uns:
        raise KeyError(f"Missing zero-pair data in subset.uns[{zero_pairs_uns_key!r}].")

    zero_pairs_by_window = subset.uns[zero_pairs_uns_key]
    starts = np.asarray(subset.uns.get(f"{zero_pairs_uns_key}_starts", []))
    window = int(subset.uns.get(f"{zero_pairs_uns_key}_window", 0))

    n_windows = len(zero_pairs_by_window)
    counts = np.zeros((subset.n_obs, n_windows), dtype=int)

    for wi, pairs in enumerate(zero_pairs_by_window):
        if pairs is None or len(pairs) == 0:
            continue
        pair_arr = np.asarray(pairs, dtype=int)
        if pair_arr.size == 0:
            continue
        if pair_arr.ndim != 2 or pair_arr.shape[1] != 2:
            raise ValueError("Zero-pair entries must be arrays of shape (n, 2).")
        # add.at accumulates repeated indices (plain fancy-index += would not).
        np.add.at(counts[:, wi], pair_arr[:, 0], 1)
        np.add.at(counts[:, wi], pair_arr[:, 1], 1)

    # Coordinate labels need matching window metadata; otherwise use indices.
    if starts.size == n_windows and window > 0:
        labels = _window_center_labels(subset.var_names, starts, window)
    else:
        labels = [str(i) for i in range(n_windows)]

    counts_df = pd.DataFrame(counts, index=subset.obs_names.astype(str), columns=labels)
    meta = subset.obs.loc[counts_df.index, list(meta_cols)].copy()
    meta.index = meta.index.astype(str)
    row_colors = make_row_colors(meta)

    def _apply_xticks(ax, tick_labels, step):
        """Place every ``step``-th label at the centre of its heatmap cell."""
        if tick_labels is None or len(tick_labels) == 0:
            ax.set_xticks([])
            return
        if step is None or step <= 0:
            step = max(1, len(tick_labels) // 10)
        ticks = np.arange(0, len(tick_labels), step)
        ax.set_xticks(ticks + 0.5)
        ax.set_xticklabels(
            [tick_labels[i] for i in ticks],
            rotation=xtick_rotation,
            fontsize=xtick_fontsize,
        )

    g = sns.clustermap(
        counts_df,
        cmap="viridis",
        col_cluster=col_cluster,
        row_cluster=True,
        row_colors=row_colors,
        xticklabels=False,
        yticklabels=False,
        figsize=figsize,
        robust=robust,
    )

    # Column clustering permutes the heatmap columns, so the labels must be
    # permuted the same way or every tick would name the wrong window.
    col_dendrogram = getattr(g, "dendrogram_col", None)
    if col_cluster and col_dendrogram is not None:
        displayed_labels = [labels[i] for i in col_dendrogram.reordered_ind]
    else:
        displayed_labels = labels
    _apply_xticks(g.ax_heatmap, displayed_labels, xtick_step)

    if title:
        g.fig.suptitle(title)

    if save_name is not None:
        fname = os.path.join(save_name)
        g.fig.savefig(fname, dpi=200, bbox_inches="tight")
        logger.info("Saved zero-Hamming pair count plot to %s.", fname)
    else:
        plt.show()

    return g
|
|
1553
|
+
|
|
1554
|
+
|
|
1555
|
+
def plot_segment_length_histogram(
    raw_lengths: np.ndarray,
    filtered_lengths: np.ndarray,
    bins: int = 30,
    title: str | None = None,
    raw_label: str = "All segments",
    filtered_label: str = "Filtered segments",
    figsize: tuple[float, float] = (8, 4),
    density: bool = True,
    save_name: str | None = None,
):
    """
    Plot an overlay histogram of segment lengths for raw vs filtered spans.

    Both histograms share one set of bin edges computed from the pooled
    lengths, so their bars line up and can be compared directly. An empty
    input is simply not drawn.

    Args:
        raw_lengths: Array of raw segment lengths.
        filtered_lengths: Array of filtered segment lengths.
        bins: Number of histogram bins (spread over the pooled length range).
        title: Optional plot title.
        raw_label: Label for raw segment histogram.
        filtered_label: Label for filtered segment histogram.
        figsize: Size of the matplotlib figure.
        density: If True, normalise each histogram to a probability density
            instead of plotting counts.
        save_name: Optional output path for saving the plot.

    Returns:
        The matplotlib figure (already closed in pyplot's registry).
    """
    # Accept plain sequences as well as arrays.
    series = (
        (np.asarray(raw_lengths), raw_label),
        (np.asarray(filtered_lengths), filtered_label),
    )
    populated = [values.ravel() for values, _ in series if values.size]
    hist_bins = np.histogram_bin_edges(np.concatenate(populated), bins=bins) if populated else bins

    fig, ax = plt.subplots(figsize=figsize)
    for values, label in series:
        if not values.size:
            continue
        ax.hist(
            values,
            bins=hist_bins,
            alpha=0.6,
            label=label,
            edgecolor="black",
            density=density,
        )

    ax.set_xlabel("Segment length")
    # density=True yields a density (area sums to 1), not per-bin probability.
    ax.set_ylabel("Probability density" if density else "Count")
    if title:
        ax.set_title(title)
    if populated:
        # Without any drawn histogram there is nothing to put in a legend.
        ax.legend()
    ax.grid(True, linestyle="--", alpha=0.3)

    if save_name is not None:
        fname = os.path.join(save_name)
        fig.savefig(fname, dpi=200, bbox_inches="tight")
        logger.info("Saved segment length histogram to %s.", fname)
    else:
        plt.show()

    plt.close(fig)
    return fig
|
|
1615
|
+
|
|
1616
|
+
|
|
1617
|
+
def plot_hamming_span_trio(
    subset,
    self_span_layer_key: str = "zero_hamming_distance_spans",
    cross_span_layer_key: str = "cross_sample_zero_hamming_distance_spans",
    delta_span_layer_key: str = "delta_zero_hamming_distance_spans",
    read_span_layer: str | None = "read_span_mask",
    outside_read_color: str = "#bdbdbd",
    span_color: str = "#2ca25f",
    cross_span_color: str = "#e6550d",
    delta_span_color: str = "#756bb1",
    figsize: tuple[float, float] = (16, 8),
    robust: bool = True,
    title: str | None = None,
    xtick_step: int | None = None,
    xtick_rotation: int = 90,
    xtick_fontsize: int = 8,
    variant_call_data: "pd.DataFrame | None" = None,
    seq1_label: str = "seq1",
    seq2_label: str = "seq2",
    ref1_marker_color: str = "white",
    ref2_marker_color: str = "black",
    variant_marker_size: float = 4.0,
    classification_obs_col: str | None = "chimeric_by_mod_hamming_distance",
    classification_true_color: str = "#000000",
    classification_false_color: str = "#f0f0f0",
    classification_panel_title: str = "Mod-hamming chimera",
    save_name: str | None = None,
):
    """
    Plot a 1×3 trio of hamming span heatmaps (self, cross, delta), optionally
    overlaying variant call circles.

    Columns that are zero/NaN in all three layers are dropped. Row order is
    determined by hierarchical clustering on the delta span layer (the original
    order is kept when there are fewer than two reads, which cannot be
    clustered). A barplot showing per-column mean span fraction is drawn above
    each panel.

    Args:
        subset: AnnData subset providing the span layers, ``obs`` and names.
        self_span_layer_key: Layer with within-sample zero-Hamming spans.
        cross_span_layer_key: Layer with cross-sample zero-Hamming spans.
        delta_span_layer_key: Layer with delta spans; also drives row order.
        read_span_layer: Layer with the read span mask; positions equal to 0
            are drawn in ``outside_read_color``. Ignored when ``None`` or
            absent from ``subset.layers``.
        outside_read_color: Color for positions outside the read span.
        span_color: Color of span cells in the self panel.
        cross_span_color: Color of span cells in the cross panel.
        delta_span_color: Color of span cells in the delta panel.
        figsize: Figure size.
        robust: Forwarded to ``sns.heatmap``.
        title: Optional figure suptitle.
        xtick_step: Spacing between x tick labels; ``None`` or a non-positive
            value picks roughly ten labels.
        xtick_rotation: Rotation for x tick labels.
        xtick_fontsize: Font size for x tick labels.
        variant_call_data: Optional reads × positions frame in which the value
            1 marks a ``seq1_label`` call and 2 a ``seq2_label`` call. Rows are
            matched to reads by stringified index; columns are matched to the
            plotted columns exactly, else to the nearest numeric position.
        seq1_label: Legend name for calls with value 1.
        seq2_label: Legend name for calls with value 2.
        ref1_marker_color: Marker fill color for value-1 calls.
        ref2_marker_color: Marker fill color for value-2 calls.
        variant_marker_size: Scatter marker size for variant calls.
        classification_obs_col: Obs column (cast to bool) drawn as a narrow
            strip right of the panels; skipped when ``None`` or missing.
        classification_true_color: Strip color for truthy reads.
        classification_false_color: Strip color for falsy reads.
        classification_panel_title: Column name given to the strip's data.
        save_name: Optional output path; the figure is shown when ``None``.

    Returns:
        The read names in plotted (clustered) row order.

    Raises:
        KeyError: If any of the three span layers is missing from
            ``subset.layers``.
    """
    logger.info(
        "Plotting hamming span trio: self=%s, cross=%s, delta=%s.",
        self_span_layer_key,
        cross_span_layer_key,
        delta_span_layer_key,
    )

    def _to_df(layer_key):
        """Return a layer as a dense reads × positions DataFrame."""
        arr = subset.layers[layer_key]
        arr = arr.toarray() if hasattr(arr, "toarray") else np.asarray(arr)
        return pd.DataFrame(arr, index=subset.obs_names.astype(str), columns=subset.var_names)

    def _apply_xticks(ax, labels, step):
        """Place every ``step``-th label at the centre of its heatmap cell."""
        if labels is None or len(labels) == 0:
            ax.set_xticks([])
            return
        if step is None or step <= 0:
            step = max(1, len(labels) // 10)
        ticks = np.arange(0, len(labels), step)
        ax.set_xticks(ticks + 0.5)
        ax.set_xticklabels(
            [labels[i] for i in ticks],
            rotation=xtick_rotation,
            fontsize=xtick_fontsize,
        )

    delta_df = _to_df(delta_span_layer_key)
    self_df = _to_df(self_span_layer_key)
    cross_df = _to_df(cross_span_layer_key)

    # Drop columns that are all-zero/NaN across all three layers
    has_data = (
        delta_df.fillna(0).any(axis=0)
        | self_df.fillna(0).any(axis=0)
        | cross_df.fillna(0).any(axis=0)
    )
    delta_df = delta_df.loc[:, has_data]
    self_df = self_df.loc[:, has_data]
    cross_df = cross_df.loc[:, has_data]

    if delta_df.shape[0] == 0 or delta_df.shape[1] == 0:
        # Nothing to draw; clustering/heatmap would fail on an empty matrix.
        logger.warning("No reads or informative positions for hamming span trio; skipping plot.")
        return delta_df.index

    # Hierarchical clustering on delta layer for row ordering
    delta_filled = delta_df.fillna(0)
    if delta_filled.shape[0] < 2:
        # A single read cannot be clustered; keep the given order.
        ordered_index = delta_filled.index
    else:
        g = sns.clustermap(
            delta_filled,
            col_cluster=False,
            row_cluster=True,
            xticklabels=False,
            yticklabels=False,
        )
        ordered_index = delta_filled.index[g.dendrogram_row.reordered_ind]
        # The clustermap figure is only needed for its row order.
        plt.close(g.fig)

    # Read span mask (True = outside the read), aligned to plotted rows/columns
    read_span_mask = None
    if read_span_layer and read_span_layer in subset.layers:
        rsm_df = _to_df(read_span_layer).loc[:, has_data]
        read_span_mask = rsm_df.loc[ordered_index].to_numpy() == 0

    panels = [
        (self_df, span_color, "Self spans"),
        (cross_df, cross_span_color, "Cross spans"),
        (delta_df, delta_span_color, "Delta spans"),
    ]

    has_classification = bool(classification_obs_col) and classification_obs_col in subset.obs

    fig = plt.figure(figsize=figsize)
    if has_classification:
        # Extra narrow fourth column for the classification strip.
        gs = fig.add_gridspec(
            2,
            4,
            height_ratios=[1, 6],
            width_ratios=[1, 1, 1, 0.12],
            wspace=0.08,
            hspace=0.05,
        )
    else:
        gs = fig.add_gridspec(
            2,
            3,
            height_ratios=[1, 6],
            wspace=0.08,
            hspace=0.05,
        )

    axes = []
    for col_idx, (df, color, panel_title) in enumerate(panels):
        ax_bar = fig.add_subplot(gs[0, col_idx])
        ax_heat = fig.add_subplot(gs[1, col_idx])
        axes.append(ax_heat)

        ordered = df.loc[ordered_index].fillna(0)
        plot_data = ordered.copy()
        if read_span_mask is not None:
            # Masked cells become NaN and are drawn with the "bad" color.
            plot_data = plot_data.mask(read_span_mask)

        # Two-color map: values near 0 -> white, values near 1 -> panel color.
        cmap = colors.ListedColormap(["white", color])
        norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], cmap.N)
        cmap.set_bad(outside_read_color)

        mean_span = np.nanmean(plot_data.to_numpy(), axis=0)
        clean_barplot(
            ax_bar, mean_span, panel_title, y_max=1.0, y_label="Span frac", y_ticks=[0.0, 0.5, 1.0]
        )

        sns.heatmap(
            plot_data,
            ax=ax_heat,
            cmap=cmap,
            norm=norm,
            xticklabels=False,
            yticklabels=False,
            robust=robust,
            cbar=False,
        )
        _apply_xticks(ax_heat, [str(x) for x in ordered.columns], xtick_step)

    if has_classification:
        class_values = (
            subset.obs.loc[ordered_index, classification_obs_col].astype(bool).astype(int)
        )
        class_df = pd.DataFrame(
            {classification_panel_title: class_values.to_numpy()},
            index=ordered_index,
        )
        class_cmap = colors.ListedColormap([classification_false_color, classification_true_color])
        class_norm = colors.BoundaryNorm([-0.5, 0.5, 1.5], class_cmap.N)

        # Blank the slot above the strip so it lines up with the heatmaps.
        ax_class_top = fig.add_subplot(gs[0, 3])
        ax_class_top.axis("off")
        ax_class = fig.add_subplot(gs[1, 3], sharey=axes[-1])
        sns.heatmap(
            class_df,
            ax=ax_class,
            cmap=class_cmap,
            norm=class_norm,
            xticklabels=False,
            yticklabels=False,
            cbar=False,
            robust=robust,
        )

    # Overlay variant call circles on all three panels
    if variant_call_data is not None:
        plotted_cols = list(self_df.loc[ordered_index].columns)
        col_to_idx = {c: i for i, c in enumerate(plotted_cols)}

        call_cols_mask = variant_call_data.isin([1, 2]).any(axis=0)
        call_col_names = variant_call_data.columns[call_cols_mask]

        # Since no column subsetting, try exact match first; fall back to nearest
        try:
            plotted_positions = np.array([float(c) for c in plotted_cols])
            call_col_positions = np.array([float(c) for c in call_col_names])  # noqa: F841
            use_searchsorted = True
        except (ValueError, TypeError):
            use_searchsorted = False

        heatmap_col_indices = {}
        for cn in call_col_names:
            if cn in col_to_idx:
                heatmap_col_indices[cn] = col_to_idx[cn]
            elif use_searchsorted:
                # NOTE(review): searchsorted assumes plotted positions are in
                # ascending order - confirm for var_names used here.
                pos = float(cn)
                idx = np.searchsorted(plotted_positions, pos)
                idx = np.clip(idx, 0, len(plotted_positions) - 1)
                left = max(0, idx - 1)
                if abs(plotted_positions[left] - pos) < abs(plotted_positions[idx] - pos):
                    idx = left
                heatmap_col_indices[cn] = idx

        if heatmap_col_indices:
            active_cols = [c for c in call_col_names if c in heatmap_col_indices]
            call_sub = variant_call_data.loc[:, active_cols]
            call_sub.index = call_sub.index.astype(str)
            common_rows = [r for r in ordered_index if r in call_sub.index]
            if common_rows:
                call_ord = call_sub.loc[common_rows].to_numpy()
                row_index_map = {r: i for i, r in enumerate(ordered_index)}
                heatmap_row_indices = np.array([row_index_map[r] for r in common_rows])
                col_idx_arr = np.array([heatmap_col_indices[c] for c in active_cols])

                for call_val, marker_color, label in [
                    (1, ref1_marker_color, f"{seq1_label} call"),
                    (2, ref2_marker_color, f"{seq2_label} call"),
                ]:
                    local_rows, local_cols = np.where(call_ord == call_val)
                    if len(local_rows) == 0:
                        continue
                    plot_y = heatmap_row_indices[local_rows]
                    plot_x = col_idx_arr[local_cols]
                    for ax in axes:
                        # +0.5 centres the marker inside its heatmap cell.
                        ax.scatter(
                            plot_x + 0.5,
                            plot_y + 0.5,
                            c=marker_color,
                            s=variant_marker_size,
                            marker="o",
                            edgecolors="gray",
                            linewidths=0.3,
                            zorder=3,
                            label=label,
                        )

        # Add legend to rightmost axis (no handles exist if nothing was drawn)
        handles, labels = axes[-1].get_legend_handles_labels()
        seen = set()
        unique_handles, unique_labels = [], []
        for h, la in zip(handles, labels):
            if la not in seen:
                seen.add(la)
                unique_handles.append(h)
                unique_labels.append(la)
        if unique_handles:
            # Push the legend past the classification strip when it is present.
            legend_x_anchor = 1.3 if has_classification else 1.02
            axes[-1].legend(
                unique_handles,
                unique_labels,
                loc="upper left",
                bbox_to_anchor=(legend_x_anchor, 1.0),
                fontsize=8,
                framealpha=0.9,
            )

    if title:
        fig.suptitle(title, fontsize=12)

    if save_name is not None:
        fname = os.path.join(save_name)
        fig.savefig(fname, dpi=200, bbox_inches="tight")
        logger.info("Saved hamming span trio to %s.", fname)
    else:
        plt.show()

    plt.close(fig)
    return ordered_index
|