smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
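The moves above amount to a re-architecture: the ML stack leaves `smftools.tools` for a dedicated `smftools.machine_learning` package, HMM tooling gets its own `smftools.hmm` package, and the old `informatics/helpers` tree is retired under `archived/`. For downstream code this changes import paths. A minimal sketch of the old-to-new mapping implied by the rename list (module names are taken from the file paths above; which symbols each module re-exports is an assumption):

```python
# 0.1.7 layout (left-hand sides of the renames above; hypothetical old imports):
# from smftools.tools import train_hmm, display_hmm
# from smftools.tools.models import rnn, positional

# 0.2.3 layout (right-hand sides of the renames):
from smftools.hmm import train_hmm, display_hmm               # smftools/{tools -> hmm}/...
from smftools.machine_learning.models import rnn, positional  # smftools/{tools -> machine_learning}/models/...
```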
smftools/plotting/hmm_plotting.py — new file (`@@ -0,0 +1,260 @@`, matching the +260 entry in the file list above):

```python
import math
import os
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


def plot_hmm_size_contours(
    adata,
    length_layer: str,
    sample_col: str,
    ref_obs_col: str,
    rows_per_page: int = 4,
    max_length_cap: Optional[int] = 1000,
    figsize_per_cell: Tuple[float, float] = (4.0, 2.5),
    cmap: str = "viridis",
    log_scale_z: bool = False,
    save_path: Optional[str] = None,
    save_pdf: bool = True,
    save_each_page: bool = False,
    dpi: int = 150,
    vmin: Optional[float] = None,
    vmax: Optional[float] = None,
    # ---------------- smoothing params ----------------
    smoothing_sigma: Optional[Union[float, Tuple[float, float]]] = None,
    normalize_after_smoothing: bool = True,
    use_scipy_if_available: bool = True,
):
    """
    Create contour/pcolormesh plots of P(length | position) using a length-encoded HMM layer.
    Optional Gaussian smoothing is applied to the 2D probability grid before plotting.

    smoothing_sigma: None or 0 -> no smoothing.
                     float -> same sigma applied to (length_axis, position_axis).
                     (sigma_len, sigma_pos) -> separate sigmas.
    normalize_after_smoothing: if True, renormalize each position-column to sum to 1 after smoothing.

    Other args are the same as the prior function.
    """
    # --- helper: gaussian smoothing (scipy if available -> numpy separable conv fallback) ---
    def _gaussian_1d_kernel(sigma: float, eps: float = 1e-12):
        if sigma is None or sigma <= 0:  # check None before the numeric comparison
            return np.array([1.0], dtype=float)
        # kernel radius ~ 3*sigma (odd length covers +/- 3 sigma)
        radius = max(1, int(math.ceil(3.0 * float(sigma))))
        xs = np.arange(-radius, radius + 1, dtype=float)
        k = np.exp(-(xs ** 2) / (2.0 * sigma ** 2))
        k_sum = k.sum()
        if k_sum <= eps:
            k = np.array([1.0], dtype=float)
            k_sum = 1.0
        return k / k_sum

    def _smooth_with_numpy_separable(Z: np.ndarray, sigma_len: float, sigma_pos: float) -> np.ndarray:
        # Z shape: (n_lengths, n_positions)
        out = Z.copy()
        # smooth along the length axis (axis=0)
        if sigma_len and sigma_len > 0:
            k_len = _gaussian_1d_kernel(sigma_len)
            # convolve each column
            out = np.apply_along_axis(lambda col: np.convolve(col, k_len, mode="same"), axis=0, arr=out)
        # smooth along the position axis (axis=1)
        if sigma_pos and sigma_pos > 0:
            k_pos = _gaussian_1d_kernel(sigma_pos)
            out = np.apply_along_axis(lambda row: np.convolve(row, k_pos, mode="same"), axis=1, arr=out)
        return out

    # prefer scipy.ndimage if available (faster and better boundary handling)
    _have_scipy = False
    if use_scipy_if_available:
        try:
            from scipy.ndimage import gaussian_filter as _scipy_gaussian_filter
            _have_scipy = True
        except Exception:
            _have_scipy = False

    def _smooth_Z(Z: np.ndarray, sigma_len: float, sigma_pos: float) -> np.ndarray:
        if (sigma_len is None or sigma_len == 0) and (sigma_pos is None or sigma_pos == 0):
            return Z
        if _have_scipy:
            # scipy expects the sigma sequence in axis order (axis=0 length, axis=1 position)
            sigma_seq = (float(sigma_len or 0.0), float(sigma_pos or 0.0))
            return _scipy_gaussian_filter(Z, sigma=sigma_seq, mode="reflect")
        else:
            return _smooth_with_numpy_separable(Z, float(sigma_len or 0.0), float(sigma_pos or 0.0))

    # --- gather unique ordered labels ---
    samples = (
        list(adata.obs[sample_col].cat.categories)
        if getattr(adata.obs[sample_col], "dtype", None) == "category"
        else list(pd.Categorical(adata.obs[sample_col]).categories)
    )
    refs = (
        list(adata.obs[ref_obs_col].cat.categories)
        if getattr(adata.obs[ref_obs_col], "dtype", None) == "category"
        else list(pd.Categorical(adata.obs[ref_obs_col]).categories)
    )

    n_samples = len(samples)
    n_refs = len(refs)
    if n_samples == 0 or n_refs == 0:
        raise ValueError("No samples or references found for plotting.")

    # Try to get numeric coordinates for the x axis; fall back to range indices
    try:
        coords = np.asarray(adata.var_names, dtype=int)
        x_ticks_is_positions = True
    except Exception:
        coords = np.arange(adata.shape[1], dtype=int)
        x_ticks_is_positions = False

    # helper to get a dense layer array
    def _get_layer_array(layer):
        arr = layer
        # sparse -> dense
        if hasattr(arr, "toarray"):
            arr = arr.toarray()
        return np.asarray(arr)

    # fetch the whole layer once
    if length_layer not in adata.layers:
        raise KeyError(f"Layer {length_layer} not found in adata.layers")
    full_layer = _get_layer_array(adata.layers[length_layer])  # shape (n_obs, n_vars)

    # precompute pages
    pages = math.ceil(n_samples / rows_per_page)
    figs = []

    # decide the global max length to allocate the y axis (cap to avoid huge memory)
    observed_max_len = int(np.max(full_layer)) if full_layer.size > 0 else 0
    if max_length_cap is None:
        max_len = observed_max_len
    else:
        max_len = min(int(max_length_cap), max(1, observed_max_len))
    if max_len < 1:
        max_len = 1

    # parse smoothing_sigma
    if smoothing_sigma is None or smoothing_sigma == 0:
        sigma_len, sigma_pos = 0.0, 0.0
    elif isinstance(smoothing_sigma, (int, float)):
        sigma_len = float(smoothing_sigma)
        sigma_pos = float(smoothing_sigma)
    else:
        sigma_len = float(smoothing_sigma[0])
        sigma_pos = float(smoothing_sigma[1])

    # iterate pages
    for p in range(pages):
        start_sample = p * rows_per_page
        end_sample = min(n_samples, (p + 1) * rows_per_page)
        page_samples = samples[start_sample:end_sample]
        rows_on_page = len(page_samples)

        fig_w = n_refs * figsize_per_cell[0]
        fig_h = rows_on_page * figsize_per_cell[1]
        fig, axes = plt.subplots(rows_on_page, n_refs, figsize=(fig_w, fig_h), squeeze=False)
        fig.suptitle(f"HMM size contours (page {p+1}/{pages})", fontsize=12)

        # for each panel compute p(length | position)
        for i_row, sample in enumerate(page_samples):
            for j_col, ref in enumerate(refs):
                ax = axes[i_row][j_col]
                panel_mask = (adata.obs[sample_col] == sample) & (adata.obs[ref_obs_col] == ref)
                if not panel_mask.any():
                    ax.text(0.5, 0.5, "no reads", ha="center", va="center")
                    ax.set_xticks([])
                    ax.set_yticks([])
                    ax.set_title(f"{sample} / {ref}")
                    continue

                row_idx = np.nonzero(panel_mask.values if hasattr(panel_mask, "values") else np.asarray(panel_mask))[0]
                if row_idx.size == 0:
                    ax.text(0.5, 0.5, "no reads", ha="center", va="center")
                    ax.set_title(f"{sample} / {ref}")
                    continue

                sub = full_layer[row_idx, :]  # (n_reads, n_positions)
                if sub.size == 0:
                    ax.text(0.5, 0.5, "no data", ha="center", va="center")
                    ax.set_title(f"{sample} / {ref}")
                    continue

                # compute counts per length per position
                n_positions = sub.shape[1]
                max_len_local = int(sub.max()) if sub.size > 0 else 0
                max_len_here = min(max_len, max_len_local)
                if max_len_here < 1:  # all layer values zero -> nothing to bin
                    ax.text(0.5, 0.5, "no nonzero lengths", ha="center", va="center")
                    ax.set_title(f"{sample} / {ref}")
                    continue

                lengths_range = np.arange(1, max_len_here + 1, dtype=int)
                Z = np.zeros((len(lengths_range), n_positions), dtype=float)  # rows=length, cols=position

                # fill Z with a bincount per column
                for j in range(n_positions):
                    col_vals = sub[:, j]
                    pos_vals = col_vals[col_vals > 0].astype(int)
                    if pos_vals.size == 0:
                        continue
                    clipped = np.clip(pos_vals, 1, max_len_here)
                    counts = np.bincount(clipped, minlength=max_len_here + 1)[1:]
                    s = counts.sum()
                    if s > 0:
                        Z[:, j] = counts.astype(float)  # keep raw counts for smoothing

                # smooth the raw counts first, then optionally renormalize each column
                # (normalize_after_smoothing controls the renormalization)
                if sigma_len > 0 or sigma_pos > 0:
                    Z = _smooth_Z(Z, sigma_len, sigma_pos)

                # normalize to a conditional probability per column
                if normalize_after_smoothing:
                    col_sums = Z.sum(axis=0, keepdims=True)
                    # avoid divide-by-zero
                    col_sums[col_sums == 0] = 1.0
                    Z = Z / col_sums

                if log_scale_z:
                    Z_plot = np.log1p(Z)
                else:
                    Z_plot = Z

                # build x and y edges for pcolormesh: x = coords (positions)
                x = coords[:n_positions]
                if n_positions >= 2:
                    dx = np.diff(x).mean()
                    x_edges = np.concatenate([x - dx / 2.0, [x[-1] + dx / 2.0]])
                else:
                    x_edges = np.array([x[0] - 0.5, x[0] + 0.5])

                y = lengths_range
                y_edges = np.concatenate([y - 0.5, [y[-1] + 0.5]])

                pcm = ax.pcolormesh(x_edges, y_edges, Z_plot, cmap=cmap, shading="auto", vmin=vmin, vmax=vmax)
                ax.set_title(f"{sample} / {ref}")
                ax.set_ylabel("length")
                if i_row == rows_on_page - 1:
                    ax.set_xlabel("position")
                else:
                    ax.set_xticklabels([])

        # shared colorbar
        fig.subplots_adjust(right=0.88)
        cax = fig.add_axes([0.9, 0.15, 0.02, 0.7])
        try:
            fig.colorbar(pcm, cax=cax)
        except Exception:
            pass

        figs.append(fig)

        # per-page PNGs if requested
        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            if save_each_page:
                fname = f"hmm_size_page_{p+1:03d}.png"
                out = os.path.join(save_path, fname)
                fig.savefig(out, dpi=dpi, bbox_inches="tight")

    # multipage PDF if requested
    if save_path is not None and save_pdf:
        pdf_file = os.path.join(save_path, "hmm_size_contours_pages.pdf")
        with PdfPages(pdf_file) as pp:
            for fig in figs:
                pp.savefig(fig, bbox_inches="tight")
        print(f"Saved multipage PDF: {pdf_file}")

    return figs
```
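The core of the function is the per-panel grid `Z[length, position]`: for each reference position it bincounts the nonzero layer values (feature lengths) across reads, optionally Gaussian-smooths the raw counts, and renormalizes each column so it reads as P(length | position). A minimal usage sketch against a synthetic AnnData (the layer name, obs columns, and output directory are all hypothetical):

```python
import numpy as np
import anndata as ad

from smftools.plotting.hmm_plotting import plot_hmm_size_contours

# Toy dataset: 200 reads x 500 positions. The layer encodes, at each position,
# the length of the HMM feature covering it (0 = not covered).
rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.random((200, 500)))
adata.var_names = [str(i) for i in range(500)]
adata.layers["hmm_feature_lengths"] = rng.integers(0, 300, size=(200, 500))
adata.obs["sample"] = np.repeat(["s1", "s2"], 100)
adata.obs["reference"] = rng.choice(["chrI", "chrII"], size=200)

figs = plot_hmm_size_contours(
    adata,
    length_layer="hmm_feature_lengths",
    sample_col="sample",
    ref_obs_col="reference",
    smoothing_sigma=(2.0, 5.0),    # sigma along (length axis, position axis)
    save_path="figures/hmm_size",  # writes a multipage PDF; per-page PNGs if save_each_page=True
)
```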
smftools/plotting/qc_plotting.py — new file (`@@ -0,0 +1,270 @@`, matching the +270 entry in the file list above):

```python
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def plot_read_qc_histograms(
    adata,
    outdir,
    obs_keys,
    sample_key,
    bins=60,
    clip_quantiles=(0.0, 0.995),
    min_non_nan=10,
    rows_per_fig=6,
    topn_categories=15,
    figsize_cell=(3.6, 2.6),
    dpi=150,
):
    """
    Plot a grid of QC histograms: rows = samples (from `sample_key`), columns = `obs_keys`.

    Numeric columns -> histogram per sample.
    Categorical columns -> bar chart of top categories per sample.

    Saves paginated PNGs to `outdir`.

    Parameters
    ----------
    adata : AnnData
    outdir : str
    obs_keys : list[str]
    sample_key : str
        Column in adata.obs defining rows (samples/barcodes).
    bins : int
        Histogram bins for numeric metrics.
    clip_quantiles : tuple or None
        Clip numeric data globally per metric for consistent axes, e.g. (0.0, 0.995).
    min_non_nan : int
        Minimum finite values to plot a panel.
    rows_per_fig : int
        Number of samples per page.
    topn_categories : int
        For categorical metrics, show the top-N categories (per sample).
    figsize_cell : (float, float)
        Size of each subplot cell (width, height).
    dpi : int
        Figure resolution.
    """
    os.makedirs(outdir, exist_ok=True)

    if sample_key not in adata.obs.columns:
        raise KeyError(f"'{sample_key}' not found in adata.obs")

    # Ensure sample_key is categorical for stable ordering
    samples = adata.obs[sample_key]
    if not pd.api.types.is_categorical_dtype(samples):
        samples = samples.astype("category")
    sample_levels = list(samples.cat.categories)

    # Validate keys and classify numeric vs categorical
    valid_keys = []
    is_numeric = {}
    for key in obs_keys:
        if key not in adata.obs.columns:
            print(f"[WARN] '{key}' not found in obs; skipping.")
            continue
        s = adata.obs[key]
        num = pd.api.types.is_numeric_dtype(s)
        valid_keys.append(key)
        is_numeric[key] = num
    if not valid_keys:
        print("[plot_read_qc_grid] No valid obs_keys to plot.")
        return

    # Precompute global numeric ranges (after clipping) so rows share an x-axis per column
    global_ranges = {}
    for key in valid_keys:
        if not is_numeric[key]:
            continue
        s = pd.to_numeric(adata.obs[key], errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
        if s.size < min_non_nan:
            # still set something to avoid errors; just use min/max or (0, 1)
            lo, hi = (0.0, 1.0) if s.size == 0 else (float(s.min()), float(s.max()))
        else:
            if clip_quantiles:
                qlo = s.quantile(clip_quantiles[0]) if clip_quantiles[0] is not None else s.min()
                qhi = s.quantile(clip_quantiles[1]) if clip_quantiles[1] is not None else s.max()
                lo, hi = float(qlo), float(qhi)
                if not (np.isfinite(lo) and np.isfinite(hi) and hi > lo):
                    lo, hi = float(s.min()), float(s.max())
            else:
                lo, hi = float(s.min()), float(s.max())
        global_ranges[key] = (lo, hi)

    def _sanitize(name: str) -> str:
        return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))

    ncols = len(valid_keys)
    fig_w = figsize_cell[0] * ncols
    # rows per page is rows_per_fig; figure height scales accordingly
    fig_h_unit = figsize_cell[1]

    for start in range(0, len(sample_levels), rows_per_fig):
        chunk = sample_levels[start:start + rows_per_fig]
        nrows = len(chunk)
        fig, axes = plt.subplots(
            nrows=nrows, ncols=ncols,
            figsize=(fig_w, fig_h_unit * nrows),
            dpi=dpi,
            squeeze=False,
        )

        for r, sample_val in enumerate(chunk):
            row_mask = (adata.obs[sample_key].values == sample_val)
            n_in_row = int(row_mask.sum())

            for c, key in enumerate(valid_keys):
                ax = axes[r, c]
                series = adata.obs.loc[row_mask, key]

                if is_numeric[key]:
                    x = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
                    if x.size < min_non_nan:
                        ax.text(0.5, 0.5, f"n={x.size} (<{min_non_nan})", ha="center", va="center")
                    else:
                        # clip to the global range for consistent axes
                        lo, hi = global_ranges[key]
                        x = x.clip(lo, hi)
                        ax.hist(x.values, bins=bins, range=(lo, hi), edgecolor="black", alpha=0.7)
                        ax.set_xlim(lo, hi)
                    if r == 0:
                        ax.set_title(key)
                    if c == 0:
                        ax.set_ylabel(f"{sample_val}\n(n={n_in_row})")
                    ax.grid(alpha=0.25)
                    ax.set_xlabel("")  # keep uncluttered; the x-limit conveys scale
                else:
                    vc = series.astype("category").value_counts(dropna=False)
                    if vc.sum() < min_non_nan:
                        ax.text(0.5, 0.5, f"n={vc.sum()} (<{min_non_nan})", ha="center", va="center")
                    else:
                        vc_top = vc.iloc[:topn_categories][::-1]  # show top-N, reversed for barh
                        ax.barh(vc_top.index.astype(str), vc_top.values)
                        ax.invert_yaxis()
                    if r == 0:
                        ax.set_title(f"{key} (cat)")
                    if c == 0:
                        ax.set_ylabel(f"{sample_val}\n(n={n_in_row})")
                    ax.grid(alpha=0.25)
                    # trim labels to reduce clutter
                    if vc.sum() >= min_non_nan:
                        ax.tick_params(axis="y", labelsize=8)

        plt.tight_layout()
        page = start // rows_per_fig + 1
        out_png = os.path.join(outdir, f"qc_grid_{_sanitize(sample_key)}_page{page}.png")
        plt.savefig(out_png, bbox_inches="tight")
        plt.close(fig)


# def plot_read_qc_histograms(
#     adata,
#     outdir,
#     obs_keys,
#     sample_key=None,
#     *,
#     bins=100,
#     clip_quantiles=(0.0, 0.995),
#     min_non_nan=10,
#     figsize=(6, 4),
#     dpi=150
# ):
#     """
#     Plots histograms for given obs_keys, optionally grouped by sample_key.
#
#     Parameters
#     ----------
#     adata : AnnData
#         AnnData object.
#     outdir : str
#         Output directory for PNG files.
#     obs_keys : list[str]
#         List of obs columns to plot.
#     sample_key : str or None
#         Column in adata.obs to group by (e.g., 'Barcode').
#         If None, plots are for the full dataset only.
#     bins : int
#         Number of histogram bins for numeric data.
#     clip_quantiles : tuple or None
#         (low_q, high_q) to clip extreme values for plotting.
#     min_non_nan : int
#         Minimum number of finite values to plot.
#     figsize : tuple
#         Figure size.
#     dpi : int
#         Figure resolution.
#     """
#     os.makedirs(outdir, exist_ok=True)
#
#     # Define grouping
#     if sample_key and sample_key in adata.obs.columns:
#         groups = adata.obs.groupby(sample_key)
#     else:
#         groups = [(None, adata.obs)]  # single group
#
#     for group_name, group_df in groups:
#         # For each metric
#         for key in obs_keys:
#             if key not in group_df.columns:
#                 print(f"[WARN] '{key}' not found in obs; skipping.")
#                 continue
#
#             series = group_df[key]
#
#             # Numeric columns
#             if pd.api.types.is_numeric_dtype(series):
#                 x = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
#                 if len(x) < min_non_nan:
#                     continue
#
#                 # Clip for better visualization
#                 if clip_quantiles:
#                     lo = x.quantile(clip_quantiles[0]) if clip_quantiles[0] is not None else x.min()
#                     hi = x.quantile(clip_quantiles[1]) if clip_quantiles[1] is not None else x.max()
#                     if np.isfinite(lo) and np.isfinite(hi) and hi > lo:
#                         x = x.clip(lo, hi)
#
#                 fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
#                 ax.hist(x, bins=bins, edgecolor="black", alpha=0.7)
#                 ax.set_xlabel(key)
#                 ax.set_ylabel("Count")
#
#                 title = f"{key}" if group_name is None else f"{key} — {sample_key}={group_name}"
#                 ax.set_title(title)
#
#                 plt.tight_layout()
#
#                 # Save PNG
#                 safe_group = "all" if group_name is None else str(group_name)
#                 fname = f"{key}_{sample_key}_{safe_group}.png" if sample_key else f"{key}.png"
#                 fname = fname.replace("/", "_")
#                 fig.savefig(os.path.join(outdir, fname))
#                 plt.close(fig)
#
#             else:
#                 # Categorical columns
#                 vc = series.astype("category").value_counts(dropna=False)
#                 if vc.sum() < min_non_nan:
#                     continue
#
#                 fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
#                 vc.plot(kind="barh", ax=ax)
#                 ax.set_xlabel("Count")
#
#                 title = f"{key} (categorical)" if group_name is None else f"{key} — {sample_key}={group_name}"
#                 ax.set_title(title)
#
#                 plt.tight_layout()
#
#                 safe_group = "all" if group_name is None else str(group_name)
#                 fname = f"{key}_{sample_key}_{safe_group}.png" if sample_key else f"{key}.png"
#                 fname = fname.replace("/", "_")
#                 fig.savefig(os.path.join(outdir, fname))
#                 plt.close(fig)
```
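Unlike the commented-out per-metric version it replaces, the active function lays out one page per block of `rows_per_fig` samples and clips each numeric metric to a global range, so histograms share an x-axis and are comparable across samples. A usage sketch (the obs column names are hypothetical; the real ones depend on what upstream QC steps such as `add_read_length_and_mapping_qc` write into `adata.obs`):

```python
from smftools.plotting.qc_plotting import plot_read_qc_histograms

# Assumes `adata` already carries per-read QC columns in adata.obs.
plot_read_qc_histograms(
    adata,
    outdir="figures/read_qc",
    obs_keys=["read_length", "mapping_quality", "Reference_strand"],  # numeric + categorical mix
    sample_key="Barcode",          # one row of panels per barcode
    bins=60,
    clip_quantiles=(0.0, 0.995),   # drop the top 0.5% so outliers don't stretch the shared x-axis
    rows_per_fig=6,                # paginate every 6 samples
)
```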
smftools/preprocessing/__init__.py (`@@ -1,31 +1,38 @@`, matching the +15 -8 entry in the file list; the bodies of the removed lines are truncated to `from .` and `"` in the rendered diff):

```diff
@@ -1,31 +1,38 @@
-from .
+from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
+from .append_base_context import append_base_context
+from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
 from .binarize_on_Youden import binarize_on_Youden
+from .binarize import binarize_adata
 from .calculate_complexity import calculate_complexity
-from .
+from .calculate_complexity_II import calculate_complexity_II
+from .calculate_read_modification_stats import calculate_read_modification_stats
 from .calculate_coverage import calculate_coverage
 from .calculate_position_Youden import calculate_position_Youden
 from .calculate_read_length_stats import calculate_read_length_stats
 from .clean_NaN import clean_NaN
 from .filter_adata_by_nan_proportion import filter_adata_by_nan_proportion
-from .
-from .
+from .filter_reads_on_modification_thresholds import filter_reads_on_modification_thresholds
+from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quality_mapping
 from .invert_adata import invert_adata
 from .load_sample_sheet import load_sample_sheet
 from .flag_duplicate_reads import flag_duplicate_reads
 from .subsample_adata import subsample_adata
 
 __all__ = [
-    "
+    "add_read_length_and_mapping_qc",
+    "append_base_context",
+    "append_binary_layer_by_base_context",
     "binarize_on_Youden",
+    "binarize_adata",
     "calculate_complexity",
-    "
+    "calculate_read_modification_stats",
     "calculate_coverage",
     "calculate_position_Youden",
     "calculate_read_length_stats",
     "clean_NaN",
     "filter_adata_by_nan_proportion",
-    "
+    "filter_reads_on_modification_thresholds",
+    "filter_reads_on_length_quality_mapping",
     "invert_adata",
     "load_sample_sheet",
     "flag_duplicate_reads",
```
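Judging by the removed modules in the file list (`append_C_context`, `calculate_converted_read_methylation_stats`, `filter_converted_reads_on_methylation`, `filter_reads_on_length`), the conversion-specific preprocessing helpers have been replaced by base-context-generic equivalents. A sketch of the new public API as callers see it (only names re-exported by the `__all__` above; the old-to-new pairing is an inference from the file list):

```python
from smftools.preprocessing import (
    append_base_context,                      # supersedes append_C_context
    calculate_read_modification_stats,        # supersedes calculate_converted_read_methylation_stats
    filter_reads_on_length_quality_mapping,   # supersedes filter_reads_on_length
    filter_reads_on_modification_thresholds,  # supersedes filter_converted_reads_on_methylation
)
```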