smftools-0.1.7-py3-none-any.whl → smftools-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
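Taken together, the moves above split the old smftools.tools grab-bag into dedicated hmm and machine_learning subpackages, add a plotting and a config layer, and retire the informatics.helpers modules into archived/. A minimal sketch of the import-path updates this implies for downstream code, assuming the importable module paths mirror the file moves listed above (the exact re-exports are set by each new __init__.py):

# Hypothetical call-site updates implied by the file moves; paths assumed to mirror the tree above.
# smftools 0.1.7:
#   from smftools.tools.apply_hmm_batched import apply_hmm_batched
#   from smftools.tools.models.rnn import ...
# smftools 0.2.3:
from smftools.hmm.apply_hmm_batched import apply_hmm_batched      # tools/ -> hmm/
from smftools.machine_learning.models import rnn                  # tools/models/ -> machine_learning/models/
from smftools.plotting import hmm_plotting, qc_plotting           # new plotting modules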
smftools/plotting/hmm_plotting.py
@@ -0,0 +1,260 @@
+ import math
+ import os
+ from typing import List, Optional, Tuple, Union
+ import numpy as np
+ import pandas as pd  # needed for pd.Categorical below
+ import matplotlib.pyplot as plt
+ from matplotlib.backends.backend_pdf import PdfPages
+
+ def plot_hmm_size_contours(
+     adata,
+     length_layer: str,
+     sample_col: str,
+     ref_obs_col: str,
+     rows_per_page: int = 4,
+     max_length_cap: Optional[int] = 1000,
+     figsize_per_cell: Tuple[float, float] = (4.0, 2.5),
+     cmap: str = "viridis",
+     log_scale_z: bool = False,
+     save_path: Optional[str] = None,
+     save_pdf: bool = True,
+     save_each_page: bool = False,
+     dpi: int = 150,
+     vmin: Optional[float] = None,
+     vmax: Optional[float] = None,
+     # ---------------- smoothing params ----------------
+     smoothing_sigma: Optional[Union[float, Tuple[float, float]]] = None,
+     normalize_after_smoothing: bool = True,
+     use_scipy_if_available: bool = True,
+ ):
+     """
+     Create contour/pcolormesh plots of P(length | position) using a length-encoded HMM layer.
+     Optional Gaussian smoothing is applied to the 2D probability grid before plotting.
+
+     smoothing_sigma: None or 0 -> no smoothing.
+                      float -> same sigma applied to (length_axis, position_axis).
+                      (sigma_len, sigma_pos) -> separate sigmas.
+     normalize_after_smoothing: if True, renormalize each position-column to sum to 1 after smoothing.
+
+     Other args are the same as the prior function.
+     """
+     # --- helper: gaussian smoothing (scipy fallback -> numpy separable conv) ---
+     def _gaussian_1d_kernel(sigma: float, eps: float = 1e-12):
+         if sigma is None or sigma <= 0:
+             return np.array([1.0], dtype=float)
+         # choose kernel size = odd ~ 6*sigma (covers +/- 3 sigma)
+         radius = max(1, int(math.ceil(3.0 * float(sigma))))
+         xs = np.arange(-radius, radius + 1, dtype=float)
+         k = np.exp(-(xs ** 2) / (2.0 * sigma ** 2))
+         k_sum = k.sum()
+         if k_sum <= eps:
+             k = np.array([1.0], dtype=float)
+             k_sum = 1.0
+         return k / k_sum
+
+     def _smooth_with_numpy_separable(Z: np.ndarray, sigma_len: float, sigma_pos: float) -> np.ndarray:
+         # Z shape: (n_lengths, n_positions)
+         out = Z.copy()
+         # smooth along length axis (axis=0)
+         if sigma_len and sigma_len > 0:
+             k_len = _gaussian_1d_kernel(sigma_len)
+             # convolve each column
+             out = np.apply_along_axis(lambda col: np.convolve(col, k_len, mode="same"), axis=0, arr=out)
+         # smooth along position axis (axis=1)
+         if sigma_pos and sigma_pos > 0:
+             k_pos = _gaussian_1d_kernel(sigma_pos)
+             out = np.apply_along_axis(lambda row: np.convolve(row, k_pos, mode="same"), axis=1, arr=out)
+         return out
+
+     # prefer scipy.ndimage if available (faster and better boundary handling)
+     _have_scipy = False
+     if use_scipy_if_available:
+         try:
+             from scipy.ndimage import gaussian_filter as _scipy_gaussian_filter
+             _have_scipy = True
+         except Exception:
+             _have_scipy = False
+
+     def _smooth_Z(Z: np.ndarray, sigma_len: float, sigma_pos: float) -> np.ndarray:
+         if (sigma_len is None or sigma_len == 0) and (sigma_pos is None or sigma_pos == 0):
+             return Z
+         if _have_scipy:
+             # scipy expects sigma sequence in axis order (axis=0 length, axis=1 pos)
+             sigma_seq = (float(sigma_len or 0.0), float(sigma_pos or 0.0))
+             return _scipy_gaussian_filter(Z, sigma=sigma_seq, mode="reflect")
+         else:
+             return _smooth_with_numpy_separable(Z, float(sigma_len or 0.0), float(sigma_pos or 0.0))
+
+     # --- gather unique ordered labels ---
+     samples = list(adata.obs[sample_col].cat.categories) if getattr(adata.obs[sample_col], "dtype", None) == "category" else list(pd.Categorical(adata.obs[sample_col]).categories)
+     refs = list(adata.obs[ref_obs_col].cat.categories) if getattr(adata.obs[ref_obs_col], "dtype", None) == "category" else list(pd.Categorical(adata.obs[ref_obs_col]).categories)
+
+     n_samples = len(samples)
+     n_refs = len(refs)
+     if n_samples == 0 or n_refs == 0:
+         raise ValueError("No samples or references found for plotting.")
+
+     # Try to get numeric coordinates for x axis; fall back to range indices
+     try:
+         coords = np.asarray(adata.var_names, dtype=int)
+         x_ticks_is_positions = True
+     except Exception:
+         coords = np.arange(adata.shape[1], dtype=int)
+         x_ticks_is_positions = False
+
+     # helper to get dense layer array for subset
+     def _get_layer_array(layer):
+         arr = layer
+         # sparse -> toarray
+         if hasattr(arr, "toarray"):
+             arr = arr.toarray()
+         return np.asarray(arr)
+
+     # fetch the whole layer once (not necessary but helps)
+     if length_layer not in adata.layers:
+         raise KeyError(f"Layer {length_layer} not found in adata.layers")
+     full_layer = _get_layer_array(adata.layers[length_layer])  # shape (n_obs, n_vars)
+
+     # Precompute pages
+     pages = math.ceil(n_samples / rows_per_page)
+     figs = []
+
+     # decide global max length to allocate y axis (cap to avoid huge memory)
+     observed_max_len = int(np.max(full_layer)) if full_layer.size > 0 else 0
+     if max_length_cap is None:
+         max_len = observed_max_len
+     else:
+         max_len = min(int(max_length_cap), max(1, observed_max_len))
+     if max_len < 1:
+         max_len = 1
+
+     # parse smoothing_sigma
+     if smoothing_sigma is None or smoothing_sigma == 0:
+         sigma_len, sigma_pos = 0.0, 0.0
+     elif isinstance(smoothing_sigma, (int, float)):
+         sigma_len = float(smoothing_sigma)
+         sigma_pos = float(smoothing_sigma)
+     else:
+         sigma_len = float(smoothing_sigma[0])
+         sigma_pos = float(smoothing_sigma[1])
+
+     # iterate pages
+     for p in range(pages):
+         start_sample = p * rows_per_page
+         end_sample = min(n_samples, (p + 1) * rows_per_page)
+         page_samples = samples[start_sample:end_sample]
+         rows_on_page = len(page_samples)
+
+         fig_w = n_refs * figsize_per_cell[0]
+         fig_h = rows_on_page * figsize_per_cell[1]
+         fig, axes = plt.subplots(rows_on_page, n_refs, figsize=(fig_w, fig_h), squeeze=False)
+         fig.suptitle(f"HMM size contours (page {p+1}/{pages})", fontsize=12)
+
+         # for each panel compute p(length | position)
+         for i_row, sample in enumerate(page_samples):
+             for j_col, ref in enumerate(refs):
+                 ax = axes[i_row][j_col]
+                 panel_mask = (adata.obs[sample_col] == sample) & (adata.obs[ref_obs_col] == ref)
+                 if not panel_mask.any():
+                     ax.text(0.5, 0.5, "no reads", ha="center", va="center")
+                     ax.set_xticks([])
+                     ax.set_yticks([])
+                     ax.set_title(f"{sample} / {ref}")
+                     continue
+
+                 row_idx = np.nonzero(panel_mask.values if hasattr(panel_mask, "values") else np.asarray(panel_mask))[0]
+                 if row_idx.size == 0:
+                     ax.text(0.5, 0.5, "no reads", ha="center", va="center")
+                     ax.set_title(f"{sample} / {ref}")
+                     continue
+
+                 sub = full_layer[row_idx, :]  # (n_reads, n_positions)
+                 if sub.size == 0:
+                     ax.text(0.5, 0.5, "no data", ha="center", va="center")
+                     ax.set_title(f"{sample} / {ref}")
+                     continue
+
+                 # compute counts per length per position
+                 n_positions = sub.shape[1]
+                 max_len_local = int(sub.max()) if sub.size > 0 else 0
+                 max_len_here = min(max_len, max_len_local)
+
+                 lengths_range = np.arange(1, max_len_here + 1, dtype=int)
+                 Z = np.zeros((len(lengths_range), n_positions), dtype=float)  # rows=length, cols=pos
+
+                 # fill Z by efficient bincount across columns
+                 for j in range(n_positions):
+                     col_vals = sub[:, j]
+                     pos_vals = col_vals[col_vals > 0].astype(int)
+                     if pos_vals.size == 0:
+                         continue
+                     clipped = np.clip(pos_vals, 1, max_len_here)
+                     counts = np.bincount(clipped, minlength=max_len_here + 1)[1:]
+                     s = counts.sum()
+                     if s > 0:
+                         Z[:, j] = counts.astype(float)  # keep counts for smoothing
+
+                 # smooth the raw counts first, then optionally renormalize each
+                 # column to p(length | pos) (controlled by normalize_after_smoothing)
+                 if sigma_len > 0 or sigma_pos > 0:
+                     Z = _smooth_Z(Z, sigma_len, sigma_pos)
+
+                 # normalize to conditional probability per column
+                 if normalize_after_smoothing:
+                     col_sums = Z.sum(axis=0, keepdims=True)
+                     # avoid divide-by-zero
+                     col_sums[col_sums == 0] = 1.0
+                     Z = Z / col_sums
+
+                 if log_scale_z:
+                     Z_plot = np.log1p(Z)
+                 else:
+                     Z_plot = Z
+
+                 # Build x and y grids for pcolormesh: x = coords (positions)
+                 x = coords[:n_positions]
+                 if n_positions >= 2:
+                     dx = np.diff(x).mean()
+                     x_edges = np.concatenate([x - dx / 2.0, [x[-1] + dx / 2.0]])
+                 else:
+                     x_edges = np.array([x[0] - 0.5, x[0] + 0.5])
+
+                 y = lengths_range
+                 dy = 1.0
+                 y_edges = np.concatenate([y - 0.5, [y[-1] + 0.5]])
+
+                 pcm = ax.pcolormesh(x_edges, y_edges, Z_plot, cmap=cmap, shading="auto", vmin=vmin, vmax=vmax)
+                 ax.set_title(f"{sample} / {ref}")
+                 ax.set_ylabel("length")
+                 if i_row == rows_on_page - 1:
+                     ax.set_xlabel("position")
+                 else:
+                     ax.set_xticklabels([])
+
+         # colorbar
+         fig.subplots_adjust(right=0.88)
+         cax = fig.add_axes([0.9, 0.15, 0.02, 0.7])
+         try:
+             fig.colorbar(pcm, cax=cax)
+         except Exception:
+             pass
+
+         figs.append(fig)
+
+         # saving per page if requested
+         if save_path is not None:
+             os.makedirs(save_path, exist_ok=True)
+             if save_each_page:
+                 fname = f"hmm_size_page_{p+1:03d}.png"
+                 out = os.path.join(save_path, fname)
+                 fig.savefig(out, dpi=dpi, bbox_inches="tight")
+
+     # multipage PDF if requested
+     if save_path is not None and save_pdf:
+         pdf_file = os.path.join(save_path, "hmm_size_contours_pages.pdf")
+         with PdfPages(pdf_file) as pp:
+             for fig in figs:
+                 pp.savefig(fig, bbox_inches="tight")
+         print(f"Saved multipage PDF: {pdf_file}")
+
+     return figs
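For orientation, a minimal usage sketch for the plotter above. Only the function signature comes from the diff; the AnnData construction, layer name, and obs column names are placeholders invented for illustration:

import numpy as np
import anndata as ad
from smftools.plotting.hmm_plotting import plot_hmm_size_contours  # module path assumed from the diff

# Toy AnnData: 100 reads x 500 positions, plus a layer encoding an HMM feature length per position.
X = np.zeros((100, 500))
adata = ad.AnnData(X)
adata.obs["Sample"] = np.random.choice(["s1", "s2"], size=100)                  # placeholder obs columns
adata.obs["Reference"] = "chrTest"
adata.layers["hmm_feature_lengths"] = np.random.randint(0, 300, size=X.shape)   # placeholder layer name

figs = plot_hmm_size_contours(
    adata,
    length_layer="hmm_feature_lengths",
    sample_col="Sample",
    ref_obs_col="Reference",
    smoothing_sigma=(2.0, 5.0),     # separate sigmas for the length and position axes
    save_path="qc_out/hmm_size",    # writes per-page PNGs (if requested) and a multipage PDF
)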
smftools/plotting/qc_plotting.py
@@ -0,0 +1,270 @@
+ import os
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+
+
+ def plot_read_qc_histograms(
+     adata,
+     outdir,
+     obs_keys,
+     sample_key,
+     bins=60,
+     clip_quantiles=(0.0, 0.995),
+     min_non_nan=10,
+     rows_per_fig=6,
+     topn_categories=15,
+     figsize_cell=(3.6, 2.6),
+     dpi=150,
+ ):
+     """
+     Plot a grid of QC histograms: rows = samples (from `sample_key`), columns = `obs_keys`.
+
+     Numeric columns -> histogram per sample.
+     Categorical columns -> bar chart of top categories per sample.
+
+     Saves paginated PNGs to `outdir`.
+
+     Parameters
+     ----------
+     adata : AnnData
+     outdir : str
+     obs_keys : list[str]
+     sample_key : str
+         Column in adata.obs defining rows (samples/barcodes).
+     bins : int
+         Histogram bins for numeric metrics.
+     clip_quantiles : tuple or None
+         Clip numeric data globally per metric for consistent axes, e.g. (0.0, 0.995).
+     min_non_nan : int
+         Minimum finite values to plot a panel.
+     rows_per_fig : int
+         Number of samples per page.
+     topn_categories : int
+         For categorical metrics, show the top-N categories (per sample).
+     figsize_cell : (float, float)
+         Size of each subplot cell (width, height).
+     dpi : int
+         Figure resolution.
+     """
+     os.makedirs(outdir, exist_ok=True)
+
+     if sample_key not in adata.obs.columns:
+         raise KeyError(f"'{sample_key}' not found in adata.obs")
+
+     # Ensure sample_key is categorical for stable ordering
+     samples = adata.obs[sample_key]
+     if not pd.api.types.is_categorical_dtype(samples):
+         samples = samples.astype("category")
+     sample_levels = list(samples.cat.categories)
+
+     # Validate keys, and classify numeric vs categorical
+     valid_keys = []
+     is_numeric = {}
+     for key in obs_keys:
+         if key not in adata.obs.columns:
+             print(f"[WARN] '{key}' not found in obs; skipping.")
+             continue
+         s = adata.obs[key]
+         num = pd.api.types.is_numeric_dtype(s)
+         valid_keys.append(key)
+         is_numeric[key] = num
+     if not valid_keys:
+         print("[plot_read_qc_grid] No valid obs_keys to plot.")
+         return
+
+     # Precompute global numeric ranges (after clipping) so rows share an x-axis per column
+     global_ranges = {}
+     for key in valid_keys:
+         if not is_numeric[key]:
+             continue
+         s = pd.to_numeric(adata.obs[key], errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
+         if s.size < min_non_nan:
+             # still set something to avoid errors; just use min/max or (0, 1)
+             lo, hi = (0.0, 1.0) if s.size == 0 else (float(s.min()), float(s.max()))
+         else:
+             if clip_quantiles:
+                 qlo = s.quantile(clip_quantiles[0]) if clip_quantiles[0] is not None else s.min()
+                 qhi = s.quantile(clip_quantiles[1]) if clip_quantiles[1] is not None else s.max()
+                 lo, hi = float(qlo), float(qhi)
+                 if not (np.isfinite(lo) and np.isfinite(hi) and hi > lo):
+                     lo, hi = float(s.min()), float(s.max())
+             else:
+                 lo, hi = float(s.min()), float(s.max())
+         global_ranges[key] = (lo, hi)
+
+     def _sanitize(name: str) -> str:
+         return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
+
+     ncols = len(valid_keys)
+     fig_w = figsize_cell[0] * ncols
+     # rows per page is rows_per_fig; figure height scales accordingly
+     fig_h_unit = figsize_cell[1]
+
+     for start in range(0, len(sample_levels), rows_per_fig):
+         chunk = sample_levels[start:start + rows_per_fig]
+         nrows = len(chunk)
+         fig, axes = plt.subplots(
+             nrows=nrows, ncols=ncols,
+             figsize=(fig_w, fig_h_unit * nrows),
+             dpi=dpi,
+             squeeze=False,
+         )
+
+         for r, sample_val in enumerate(chunk):
+             row_mask = (adata.obs[sample_key].values == sample_val)
+             n_in_row = int(row_mask.sum())
+
+             for c, key in enumerate(valid_keys):
+                 ax = axes[r, c]
+                 series = adata.obs.loc[row_mask, key]
+
+                 if is_numeric[key]:
+                     x = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
+                     if x.size < min_non_nan:
+                         ax.text(0.5, 0.5, f"n={x.size} (<{min_non_nan})", ha="center", va="center")
+                     else:
+                         # clip to global range for consistent axes
+                         lo, hi = global_ranges[key]
+                         x = x.clip(lo, hi)
+                         ax.hist(x.values, bins=bins, range=(lo, hi), edgecolor="black", alpha=0.7)
+                         ax.set_xlim(lo, hi)
+                     if r == 0:
+                         ax.set_title(key)
+                     if c == 0:
+                         ax.set_ylabel(f"{sample_val}\n(n={n_in_row})")
+                     ax.grid(alpha=0.25)
+                     ax.set_xlabel("")  # keep uncluttered; the x-limit conveys scale
+                 else:
+                     vc = series.astype("category").value_counts(dropna=False)
+                     if vc.sum() < min_non_nan:
+                         ax.text(0.5, 0.5, f"n={vc.sum()} (<{min_non_nan})", ha="center", va="center")
+                     else:
+                         vc_top = vc.iloc[:topn_categories][::-1]  # show top-N, reversed for barh
+                         ax.barh(vc_top.index.astype(str), vc_top.values)
+                         ax.invert_yaxis()
+                     if r == 0:
+                         ax.set_title(f"{key} (cat)")
+                     if c == 0:
+                         ax.set_ylabel(f"{sample_val}\n(n={n_in_row})")
+                     ax.grid(alpha=0.25)
+                     # trim labels to reduce clutter
+                     if vc.sum() >= min_non_nan:
+                         ax.tick_params(axis="y", labelsize=8)
+
+         plt.tight_layout()
+         page = start // rows_per_fig + 1
+         out_png = os.path.join(outdir, f"qc_grid_{_sanitize(sample_key)}_page{page}.png")
+         plt.savefig(out_png, bbox_inches="tight")
+         plt.close(fig)
+
+
+ # def plot_read_qc_histograms(
+ #     adata,
+ #     outdir,
+ #     obs_keys,
+ #     sample_key=None,
+ #     *,
+ #     bins=100,
+ #     clip_quantiles=(0.0, 0.995),
+ #     min_non_nan=10,
+ #     figsize=(6, 4),
+ #     dpi=150
+ # ):
+ #     """
+ #     Plots histograms for given obs_keys, optionally grouped by sample_key.
+
+ #     Parameters
+ #     ----------
+ #     adata : AnnData
+ #         AnnData object.
+ #     outdir : str
+ #         Output directory for PNG files.
+ #     obs_keys : list[str]
+ #         List of obs columns to plot.
+ #     sample_key : str or None
+ #         Column in adata.obs to group by (e.g., 'Barcode').
+ #         If None, plots are for the full dataset only.
+ #     bins : int
+ #         Number of histogram bins for numeric data.
+ #     clip_quantiles : tuple or None
+ #         (low_q, high_q) to clip extreme values for plotting.
+ #     min_non_nan : int
+ #         Minimum number of finite values to plot.
+ #     figsize : tuple
+ #         Figure size.
+ #     dpi : int
+ #         Figure resolution.
+ #     """
+ #     os.makedirs(outdir, exist_ok=True)
+
+ #     # Define grouping
+ #     if sample_key and sample_key in adata.obs.columns:
+ #         groups = adata.obs.groupby(sample_key)
+ #     else:
+ #         groups = [(None, adata.obs)]  # single group
+
+ #     for group_name, group_df in groups:
+ #         # For each metric
+ #         for key in obs_keys:
+ #             if key not in group_df.columns:
+ #                 print(f"[WARN] '{key}' not found in obs; skipping.")
+ #                 continue
+
+ #             series = group_df[key]
+
+ #             # Numeric columns
+ #             if pd.api.types.is_numeric_dtype(series):
+ #                 x = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
+ #                 if len(x) < min_non_nan:
+ #                     continue
+
+ #                 # Clip for better visualization
+ #                 if clip_quantiles:
+ #                     lo = x.quantile(clip_quantiles[0]) if clip_quantiles[0] is not None else x.min()
+ #                     hi = x.quantile(clip_quantiles[1]) if clip_quantiles[1] is not None else x.max()
+ #                     if np.isfinite(lo) and np.isfinite(hi) and hi > lo:
+ #                         x = x.clip(lo, hi)
+
+ #                 fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
+ #                 ax.hist(x, bins=bins, edgecolor="black", alpha=0.7)
+ #                 ax.set_xlabel(key)
+ #                 ax.set_ylabel("Count")
+
+ #                 title = f"{key}" if group_name is None else f"{key} — {sample_key}={group_name}"
+ #                 ax.set_title(title)
+
+ #                 plt.tight_layout()
+
+ #                 # Save PNG
+ #                 safe_group = "all" if group_name is None else str(group_name)
+ #                 fname = f"{key}_{sample_key}_{safe_group}.png" if sample_key else f"{key}.png"
+ #                 fname = fname.replace("/", "_")
+ #                 fig.savefig(os.path.join(outdir, fname))
+ #                 plt.close(fig)
+
+ #             else:
+ #                 # Categorical columns
+ #                 vc = series.astype("category").value_counts(dropna=False)
+ #                 if vc.sum() < min_non_nan:
+ #                     continue
+
+ #                 fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
+ #                 vc.plot(kind="barh", ax=ax)
+ #                 ax.set_xlabel("Count")
+
+ #                 title = f"{key} (categorical)" if group_name is None else f"{key} — {sample_key}={group_name}"
+ #                 ax.set_title(title)
+
+ #                 plt.tight_layout()
+
+ #                 safe_group = "all" if group_name is None else str(group_name)
+ #                 fname = f"{key}_{sample_key}_{safe_group}.png" if sample_key else f"{key}.png"
+ #                 fname = fname.replace("/", "_")
+ #                 fig.savefig(os.path.join(outdir, fname))
+ #                 plt.close(fig)
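A comparable sketch for driving the QC grid plotter above. The obs metric names and barcode values are placeholders; only the signature is taken from the diff:

import numpy as np
import pandas as pd
import anndata as ad
from smftools.plotting.qc_plotting import plot_read_qc_histograms  # module path assumed from the diff

# Toy AnnData with per-read QC metrics in .obs (column names are placeholders).
adata = ad.AnnData(np.zeros((200, 10)))
adata.obs["Barcode"] = pd.Categorical(np.random.choice(["bc01", "bc02", "bc03"], size=200))
adata.obs["read_length"] = np.random.randint(200, 5000, size=200)
adata.obs["mapping_quality"] = np.random.randint(0, 60, size=200)

# One page per 6 barcodes; numeric metrics share x-axes per column, categorical metrics become bar charts.
plot_read_qc_histograms(
    adata,
    outdir="qc_out/read_qc",
    obs_keys=["read_length", "mapping_quality"],
    sample_key="Barcode",
    clip_quantiles=(0.0, 0.99),
)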
smftools/preprocessing/__init__.py
@@ -1,31 +1,38 @@
- from .append_C_context import append_C_context
+ from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
+ from .append_base_context import append_base_context
+ from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
  from .binarize_on_Youden import binarize_on_Youden
+ from .binarize import binarize_adata
  from .calculate_complexity import calculate_complexity
- from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
+ from .calculate_complexity_II import calculate_complexity_II
+ from .calculate_read_modification_stats import calculate_read_modification_stats
  from .calculate_coverage import calculate_coverage
  from .calculate_position_Youden import calculate_position_Youden
  from .calculate_read_length_stats import calculate_read_length_stats
  from .clean_NaN import clean_NaN
  from .filter_adata_by_nan_proportion import filter_adata_by_nan_proportion
- from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
- from .filter_reads_on_length import filter_reads_on_length
+ from .filter_reads_on_modification_thresholds import filter_reads_on_modification_thresholds
+ from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quality_mapping
  from .invert_adata import invert_adata
  from .load_sample_sheet import load_sample_sheet
  from .flag_duplicate_reads import flag_duplicate_reads
  from .subsample_adata import subsample_adata
 
  __all__ = [
-     "append_C_context",
+     "add_read_length_and_mapping_qc",
+     "append_base_context",
+     "append_binary_layer_by_base_context",
      "binarize_on_Youden",
+     "binarize_adata",
      "calculate_complexity",
-     "calculate_converted_read_methylation_stats",
+     "calculate_read_modification_stats",
      "calculate_coverage",
      "calculate_position_Youden",
      "calculate_read_length_stats",
      "clean_NaN",
      "filter_adata_by_nan_proportion",
-     "filter_converted_reads_on_methylation",
-     "filter_reads_on_length",
+     "filter_reads_on_modification_thresholds",
+     "filter_reads_on_length_quality_mapping",
      "invert_adata",
      "load_sample_sheet",
      "flag_duplicate_reads",