smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,318 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional, Tuple
6
+
7
+ import anndata as ad
8
+
9
+ from smftools.constants import LATENT_DIR, LOGGING_DIR, SEQUENCE_INTEGER_ENCODING
10
+ from smftools.logging_utils import get_logger, setup_logging
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
def latent_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    CLI-facing wrapper for representation learning.

    Called by: `smftools latent <config_path>`

    Responsibilities:
    - Load the experiment config and resolve the per-stage AnnData paths.
    - Skip entirely when a latent AnnData already exists and
      `cfg.force_redo_latent_analyses` is not set.
    - Otherwise pick the most advanced existing AnnData
      (latent > hmm > spatial > pp_dedup > pp) and hand it to
      `latent_adata_core(...)`.

    Parameters
    ----------
    config_path : str
        Path to the experiment configuration file.

    Returns
    -------
    latent_adata : AnnData | None
        AnnData with latent analyses, or None if the step was skipped
        (latent AnnData already on disk) or no input AnnData was found.
    latent_adata_path : Path | None
        Path to the latent AnnData, or None if no input AnnData was found.
    """
    from ..readwrite import safe_read_h5ad
    from .helpers import get_adata_paths, load_experiment_config

    # 1) Load the config and resolve the AnnData path for every stage.
    cfg = load_experiment_config(config_path)
    paths = get_adata_paths(cfg)
    latent_path = paths.latent

    # 2) Stage-skipping: an existing latent AnnData means the work is done,
    #    unless the user explicitly asked for a redo.
    if not getattr(cfg, "force_redo_latent_analyses", False) and latent_path.exists():
        logger.info("Latent AnnData found: %s\nSkipping smftools latent", latent_path)
        return None, latent_path

    # 3) Decide which AnnData to use as the *starting point*: the first
    #    existing file wins, ordered from most to least processed. The latent
    #    file itself can only be selected here on a forced redo.
    candidates = (
        latent_path,
        paths.hmm,
        paths.spatial,
        paths.pp_dedup,
        paths.pp,
    )
    source_path = next((path for path in candidates if path.exists()), None)
    if source_path is None:
        logger.warning(
            "No suitable AnnData found for latent analyses (need at least preprocessed)."
        )
        return None, None

    # safe_read_h5ad returns a 2-tuple; only the AnnData is needed here.
    start_adata, _ = safe_read_h5ad(source_path)

    # 4) Run the latent core.
    adata_latent, latent_path = latent_adata_core(
        adata=start_adata,
        cfg=cfg,
        paths=paths,
        source_adata_path=source_path,
        config_path=config_path,
    )

    return adata_latent, latent_path
93
+
94
+
95
def latent_adata_core(
    adata: ad.AnnData,
    cfg,
    paths: AdataPaths,
    source_adata_path: Optional[Path] = None,
    config_path: Optional[str] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core latent-representation analysis pipeline.

    Assumes:
    - `adata` is (typically) the preprocessed, duplicate-removed AnnData.
    - `cfg` is the ExperimentConfig.

    Does:
    - Logging setup (optionally to a timestamped file under the latent dir).
    - Optional sample sheet load.
    - Optional inversion, then reference reindexing.
    - UMAP + Leiden, with UMAP and PCA plots.
    - NMF, with embedding and component plots.
    - CP decomposition of the sequence integer encoding layer, if present.
    - Save latent AnnData to `paths.latent`.

    Each analysis step is skipped when its output directory already exists,
    unless `cfg.force_redo_latent_analyses` or (kept for backward
    compatibility) `cfg.force_redo_spatial_analyses` is set.

    Parameters
    ----------
    adata : AnnData
        Starting AnnData.
    cfg
        Experiment configuration object.
    paths : AdataPaths
        Per-stage AnnData paths; only `paths.latent` is read here.
    source_adata_path : Path | None
        Path `adata` was loaded from; recorded as the input in the metadata.
    config_path : str | None
        Path of the config file; recorded in the metadata.

    Returns
    -------
    adata : AnnData
        The analyzed AnnData. Several steps rebind `adata` to their return
        value, so this is not guaranteed to be the object that was passed in.
    adata_path : Path
        `paths.latent`. The file is only (re)written when it does not exist
        yet or `cfg.force_redo_latent_analyses` is set.
    """
    from datetime import datetime

    from ..metadata import record_smftools_metadata
    from ..plotting import (
        plot_cp_sequence_components,
        plot_embedding,
        plot_nmf_components,
        plot_pca,
        plot_umap,
    )
    from ..preprocessing import (
        invert_adata,
        load_sample_sheet,
        reindex_references_adata,
    )
    from ..readwrite import make_dirs
    from ..tools import (
        calculate_leiden,
        calculate_nmf,
        calculate_sequence_cp_decomposition,
        calculate_umap,
    )
    from .helpers import write_gz_h5ad

    # -----------------------------
    # General setup
    # -----------------------------
    # Take a single clock reading so the date and time parts of the log file
    # name cannot disagree around midnight.
    now = datetime.now()
    date_str = now.strftime("%y%m%d")
    time_str = now.strftime("%H%M%S")
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)

    latent_adata_path = paths.latent

    output_directory = Path(cfg.output_directory)
    latent_directory = output_directory / LATENT_DIR
    logging_directory = latent_directory / LOGGING_DIR

    make_dirs([output_directory, latent_directory])

    if cfg.emit_log_file:
        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
        make_dirs([logging_directory])
    else:
        log_file = None

    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

    smf_modality = cfg.smf_modality

    # The per-step "already done" checks originally consulted only the spatial
    # redo flag, so forcing a latent redo did not recompute anything once the
    # plot directories existed. Honor the latent flag, and keep the spatial
    # flag so existing configs behave as before.
    force_redo_latent = getattr(cfg, "force_redo_latent_analyses", False)
    force_redo = bool(
        force_redo_latent or getattr(cfg, "force_redo_spatial_analyses", False)
    )

    def _plot_colors():
        """Return the obs columns used to color every embedding plot."""
        base = ["leiden", cfg.sample_name_col_for_plotting, "Reference_strand"]
        # Tolerate an unset (None) list of extra layers.
        return base + list(cfg.umap_layers_to_plot or [])

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # -----------------------------
    # Optional inversion along positions axis
    # -----------------------------
    if getattr(cfg, "invert_adata", False):
        adata = invert_adata(adata)

    # -----------------------------
    # Reindexing by reference
    # -----------------------------
    # Always called; NOTE(review): presumably a no-op when no offsets are
    # configured - confirm against reindex_references_adata.
    reindex_references_adata(
        adata,
        reference_col=cfg.reference_column,
        offsets=cfg.reindexing_offsets,
        new_col=cfg.reindexed_var_suffix,
    )

    # Requires the reference column to be categorical (uses the .cat accessor).
    references = adata.obs[cfg.reference_column].cat.categories

    # ============================================================
    # Latent analyses (output under <latent dir>/deduplicated)
    # ============================================================
    latent_dir_dedup = latent_directory / "deduplicated"
    umap_dir = latent_dir_dedup / "07_umaps"
    nmf_dir = latent_dir_dedup / "07b_nmf"
    nmf_sequence_dir = latent_dir_dedup / "07c_nmf_sequence"

    # "conversion" and "direct" modalities filter on every configured target
    # base; any other modality is treated as deaminase and uses C sites only.
    if smf_modality in ("conversion", "direct"):
        var_filters = [
            f"{ref}_{base}_site" for ref in references for base in cfg.mod_target_bases
        ]
    else:
        var_filters = [f"{ref}_C_site" for ref in references]

    # UMAP / Leiden
    if umap_dir.is_dir() and not force_redo:
        logger.debug("%s already exists. Skipping UMAP plotting.", umap_dir)
    else:
        make_dirs([umap_dir])

        adata = calculate_umap(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_pcs=10,
            knn_neighbors=15,
        )

        calculate_leiden(adata, resolution=0.1)

        umap_layers = _plot_colors()
        plot_umap(adata, color=umap_layers, output_dir=umap_dir)
        plot_pca(adata, color=umap_layers, output_dir=umap_dir)

    # NMF
    # NOTE(review): the plots below color by "leiden", which this function
    # only computes in the UMAP step above - confirm the column is present
    # when that step was skipped.
    if nmf_dir.is_dir() and not force_redo:
        logger.debug("%s already exists. Skipping NMF plotting.", nmf_dir)
    else:
        make_dirs([nmf_dir])
        adata = calculate_nmf(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_components=5,
        )
        plot_embedding(adata, basis="nmf", color=_plot_colors(), output_dir=nmf_dir)
        plot_nmf_components(adata, output_dir=nmf_dir)

    # CP decomposition using sequence integer encoding (no var filters)
    if nmf_sequence_dir.is_dir() and not force_redo:
        logger.debug("%s already exists. Skipping sequence CP plotting.", nmf_sequence_dir)
    elif SEQUENCE_INTEGER_ENCODING not in adata.layers:
        logger.warning(
            "Layer %s not found; skipping sequence integer encoding CP.",
            SEQUENCE_INTEGER_ENCODING,
        )
    else:
        make_dirs([nmf_sequence_dir])
        adata = calculate_sequence_cp_decomposition(
            adata,
            layer=SEQUENCE_INTEGER_ENCODING,
            rank=5,
            embedding_key="X_cp_sequence",
            components_key="H_cp_sequence",
            uns_key="cp_sequence",
        )
        plot_embedding(
            adata,
            basis="cp_sequence",
            color=_plot_colors(),
            output_dir=nmf_sequence_dir,
        )
        plot_cp_sequence_components(
            adata,
            output_dir=nmf_sequence_dir,
            components_key="H_cp_sequence",
            uns_key="cp_sequence",
        )

    # ============================================================
    # Save latent AnnData
    # ============================================================
    if (not latent_adata_path.exists()) or force_redo_latent:
        logger.info("Saving latent analyzed AnnData (post preprocessing and duplicate removal).")
        record_smftools_metadata(
            adata,
            step_name="latent",
            cfg=cfg,
            config_path=config_path,
            input_paths=[source_adata_path] if source_adata_path else None,
            output_path=latent_adata_path,
        )
        write_gz_h5ad(adata, latent_adata_path)

    return adata, latent_adata_path