smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,260 @@
+import math
+import os
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib.backends.backend_pdf import PdfPages
+
+def plot_hmm_size_contours(
+    adata,
+    length_layer: str,
+    sample_col: str,
+    ref_obs_col: str,
+    rows_per_page: int = 4,
+    max_length_cap: Optional[int] = 1000,
+    figsize_per_cell: Tuple[float, float] = (4.0, 2.5),
+    cmap: str = "viridis",
+    log_scale_z: bool = False,
+    save_path: Optional[str] = None,
+    save_pdf: bool = True,
+    save_each_page: bool = False,
+    dpi: int = 150,
+    vmin: Optional[float] = None,
+    vmax: Optional[float] = None,
+    # ---------------- smoothing params ----------------
+    smoothing_sigma: Optional[Union[float, Tuple[float, float]]] = None,
+    normalize_after_smoothing: bool = True,
+    use_scipy_if_available: bool = True,
+):
+    """
+    Create contour/pcolormesh plots of P(length | position) using a length-encoded HMM layer.
+    Optional Gaussian smoothing is applied to the 2D probability grid before plotting.
+
+    smoothing_sigma: None or 0 -> no smoothing.
+                     float -> same sigma applied to (length_axis, position_axis).
+                     (sigma_len, sigma_pos) -> separate sigmas.
+    normalize_after_smoothing: if True, renormalize each position column to sum to 1 after smoothing.
+
+    Other args are the same as the prior function.
+    """
+    # --- helper: gaussian smoothing (scipy if available, numpy separable convolution as fallback) ---
+    def _gaussian_1d_kernel(sigma: float, eps: float = 1e-12):
+        if sigma is None or sigma <= 0:
+            return np.array([1.0], dtype=float)
+        # choose an odd kernel size of ~6*sigma (covers +/- 3 sigma)
+        radius = max(1, int(math.ceil(3.0 * float(sigma))))
+        xs = np.arange(-radius, radius + 1, dtype=float)
+        k = np.exp(-(xs ** 2) / (2.0 * sigma ** 2))
+        k_sum = k.sum()
+        if k_sum <= eps:
+            k = np.array([1.0], dtype=float)
+            k_sum = 1.0
+        return k / k_sum
+
+    def _smooth_with_numpy_separable(Z: np.ndarray, sigma_len: float, sigma_pos: float) -> np.ndarray:
+        # Z shape: (n_lengths, n_positions)
+        out = Z.copy()
+        # smooth along length axis (axis=0)
+        if sigma_len and sigma_len > 0:
+            k_len = _gaussian_1d_kernel(sigma_len)
+            # convolve each column
+            out = np.apply_along_axis(lambda col: np.convolve(col, k_len, mode="same"), axis=0, arr=out)
+        # smooth along position axis (axis=1)
+        if sigma_pos and sigma_pos > 0:
+            k_pos = _gaussian_1d_kernel(sigma_pos)
+            out = np.apply_along_axis(lambda row: np.convolve(row, k_pos, mode="same"), axis=1, arr=out)
+        return out
+
+    # prefer scipy.ndimage if available (faster and better boundary handling)
+    _have_scipy = False
+    if use_scipy_if_available:
+        try:
+            from scipy.ndimage import gaussian_filter as _scipy_gaussian_filter
+            _have_scipy = True
+        except Exception:
+            _have_scipy = False
+
+    def _smooth_Z(Z: np.ndarray, sigma_len: float, sigma_pos: float) -> np.ndarray:
+        if (sigma_len is None or sigma_len == 0) and (sigma_pos is None or sigma_pos == 0):
+            return Z
+        if _have_scipy:
+            # scipy expects the sigma sequence in axis order (axis=0 length, axis=1 position)
+            sigma_seq = (float(sigma_len or 0.0), float(sigma_pos or 0.0))
+            return _scipy_gaussian_filter(Z, sigma=sigma_seq, mode="reflect")
+        else:
+            return _smooth_with_numpy_separable(Z, float(sigma_len or 0.0), float(sigma_pos or 0.0))
+
+    # --- gather unique ordered labels ---
+    samples = list(adata.obs[sample_col].cat.categories) if getattr(adata.obs[sample_col], "dtype", None) == "category" else list(pd.Categorical(adata.obs[sample_col]).categories)
+    refs = list(adata.obs[ref_obs_col].cat.categories) if getattr(adata.obs[ref_obs_col], "dtype", None) == "category" else list(pd.Categorical(adata.obs[ref_obs_col]).categories)
+
+    n_samples = len(samples)
+    n_refs = len(refs)
+    if n_samples == 0 or n_refs == 0:
+        raise ValueError("No samples or references found for plotting.")
+
+    # Try to get numeric coordinates for the x axis; fall back to range indices
+    try:
+        coords = np.asarray(adata.var_names, dtype=int)
+        x_ticks_is_positions = True
+    except Exception:
+        coords = np.arange(adata.shape[1], dtype=int)
+        x_ticks_is_positions = False
+
+    # helper to get a dense layer array
+    def _get_layer_array(layer):
+        arr = layer
+        # sparse -> toarray
+        if hasattr(arr, "toarray"):
+            arr = arr.toarray()
+        return np.asarray(arr)
+
+    # fetch the whole layer once (not strictly necessary, but avoids repeated conversion)
+    if length_layer not in adata.layers:
+        raise KeyError(f"Layer {length_layer} not found in adata.layers")
+    full_layer = _get_layer_array(adata.layers[length_layer])  # shape (n_obs, n_vars)
+
+    # Precompute pages
+    pages = math.ceil(n_samples / rows_per_page)
+    figs = []
+
+    # decide global max length to allocate the y axis (cap to avoid huge memory)
+    observed_max_len = int(np.max(full_layer)) if full_layer.size > 0 else 0
+    if max_length_cap is None:
+        max_len = observed_max_len
+    else:
+        max_len = min(int(max_length_cap), max(1, observed_max_len))
+    if max_len < 1:
+        max_len = 1
+
+    # parse smoothing_sigma
+    if smoothing_sigma is None or smoothing_sigma == 0:
+        sigma_len, sigma_pos = 0.0, 0.0
+    elif isinstance(smoothing_sigma, (int, float)):
+        sigma_len = float(smoothing_sigma)
+        sigma_pos = float(smoothing_sigma)
+    else:
+        sigma_len = float(smoothing_sigma[0])
+        sigma_pos = float(smoothing_sigma[1])
+
+    # iterate pages
+    for p in range(pages):
+        start_sample = p * rows_per_page
+        end_sample = min(n_samples, (p + 1) * rows_per_page)
+        page_samples = samples[start_sample:end_sample]
+        rows_on_page = len(page_samples)
+
+        fig_w = n_refs * figsize_per_cell[0]
+        fig_h = rows_on_page * figsize_per_cell[1]
+        fig, axes = plt.subplots(rows_on_page, n_refs, figsize=(fig_w, fig_h), squeeze=False)
+        fig.suptitle(f"HMM size contours (page {p+1}/{pages})", fontsize=12)
+
+        # for each panel compute p(length | position)
+        for i_row, sample in enumerate(page_samples):
+            for j_col, ref in enumerate(refs):
+                ax = axes[i_row][j_col]
+                panel_mask = (adata.obs[sample_col] == sample) & (adata.obs[ref_obs_col] == ref)
+                if not panel_mask.any():
+                    ax.text(0.5, 0.5, "no reads", ha="center", va="center")
+                    ax.set_xticks([])
+                    ax.set_yticks([])
+                    ax.set_title(f"{sample} / {ref}")
+                    continue
+
+                row_idx = np.nonzero(panel_mask.values if hasattr(panel_mask, "values") else np.asarray(panel_mask))[0]
+                if row_idx.size == 0:
+                    ax.text(0.5, 0.5, "no reads", ha="center", va="center")
+                    ax.set_title(f"{sample} / {ref}")
+                    continue
+
+                sub = full_layer[row_idx, :]  # (n_reads, n_positions)
+                if sub.size == 0:
+                    ax.text(0.5, 0.5, "no data", ha="center", va="center")
+                    ax.set_title(f"{sample} / {ref}")
+                    continue
+
+                # compute counts per length per position
+                n_positions = sub.shape[1]
+                max_len_local = int(sub.max()) if sub.size > 0 else 0
+                max_len_here = min(max_len, max(1, max_len_local))  # keep at least one length bin
+
+                lengths_range = np.arange(1, max_len_here + 1, dtype=int)
+                Z = np.zeros((len(lengths_range), n_positions), dtype=float)  # rows=length, cols=pos
+
+                # fill Z by bincount across columns
+                for j in range(n_positions):
+                    col_vals = sub[:, j]
+                    pos_vals = col_vals[col_vals > 0].astype(int)
+                    if pos_vals.size == 0:
+                        continue
+                    clipped = np.clip(pos_vals, 1, max_len_here)
+                    counts = np.bincount(clipped, minlength=max_len_here + 1)[1:]
+                    s = counts.sum()
+                    if s > 0:
+                        Z[:, j] = counts.astype(float)  # keep counts for smoothing
+
+                # Smooth the raw counts first, then optionally renormalize each
+                # column to p(length | pos) (controlled by normalize_after_smoothing).
+                if sigma_len > 0 or sigma_pos > 0:
+                    Z = _smooth_Z(Z, sigma_len, sigma_pos)
+
+                # normalize to a conditional probability per column
+                if normalize_after_smoothing:
+                    col_sums = Z.sum(axis=0, keepdims=True)
+                    # avoid divide-by-zero
+                    col_sums[col_sums == 0] = 1.0
+                    Z = Z / col_sums
+
+                if log_scale_z:
+                    Z_plot = np.log1p(Z)
+                else:
+                    Z_plot = Z
+
+                # Build x and y edges for pcolormesh: x = coords (positions)
+                x = coords[:n_positions]
+                if n_positions >= 2:
+                    dx = np.diff(x).mean()
+                    x_edges = np.concatenate([x - dx / 2.0, [x[-1] + dx / 2.0]])
+                else:
+                    x_edges = np.array([x[0] - 0.5, x[0] + 0.5])
+
+                y = lengths_range
+                y_edges = np.concatenate([y - 0.5, [y[-1] + 0.5]])
+
+                pcm = ax.pcolormesh(x_edges, y_edges, Z_plot, cmap=cmap, shading="auto", vmin=vmin, vmax=vmax)
+                ax.set_title(f"{sample} / {ref}")
+                ax.set_ylabel("length")
+                if i_row == rows_on_page - 1:
+                    ax.set_xlabel("position")
+                else:
+                    ax.set_xticklabels([])
+
+        # colorbar
+        fig.subplots_adjust(right=0.88)
+        cax = fig.add_axes([0.9, 0.15, 0.02, 0.7])
+        try:
+            fig.colorbar(pcm, cax=cax)
+        except Exception:
+            pass
+
+        figs.append(fig)
+
+        # saving per page if requested
+        if save_path is not None:
+            os.makedirs(save_path, exist_ok=True)
+            if save_each_page:
+                fname = f"hmm_size_page_{p+1:03d}.png"
+                out = os.path.join(save_path, fname)
+                fig.savefig(out, dpi=dpi, bbox_inches="tight")
+
+    # multipage PDF if requested
+    if save_path is not None and save_pdf:
+        pdf_file = os.path.join(save_path, "hmm_size_contours_pages.pdf")
+        with PdfPages(pdf_file) as pp:
+            for fig in figs:
+                pp.savefig(fig, bbox_inches="tight")
+        print(f"Saved multipage PDF: {pdf_file}")
+
+    return figs
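
The sketch below shows how the plotting helper added in this diff might be called. The import path, layer name, and obs column names are illustrative assumptions rather than documented smftools API; adjust them to match your own AnnData object and install.

    import anndata as ad
    # Hypothetical import path -- adjust to wherever plot_hmm_size_contours is exposed.
    from smftools.plotting import plot_hmm_size_contours

    # Assumed inputs: an AnnData with a length-encoded HMM layer and
    # categorical sample / reference columns in .obs (names are placeholders).
    adata = ad.read_h5ad("smf_experiment.h5ad")

    figs = plot_hmm_size_contours(
        adata,
        length_layer="hmm_feature_lengths",   # hypothetical layer name
        sample_col="sample",                  # hypothetical obs column
        ref_obs_col="reference",              # hypothetical obs column
        rows_per_page=4,
        max_length_cap=1000,
        smoothing_sigma=(2.0, 5.0),           # (sigma_len, sigma_pos)
        save_path="plots/hmm_size_contours",  # directory; created if missing
        save_pdf=True,
    )
    # Returns one matplotlib Figure per page; a multipage PDF is written to save_path.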