smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py CHANGED
@@ -1,20 +1,52 @@
1
1
  """smftools"""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import logging
4
6
  import warnings
7
+ from importlib import import_module
5
8
  from importlib.metadata import version
9
+ from typing import TYPE_CHECKING
6
10
 
7
- from . import cli, config, datasets, hmm
8
- from . import informatics as inform
9
- from . import machine_learning as ml
10
- from . import plotting as pl
11
- from . import preprocessing as pp
12
- from . import tools as tl
13
- from .readwrite import adata_to_df, merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad
11
+ from .readwrite import adata_to_df, safe_read_h5ad, safe_write_h5ad
14
12
 
15
13
  package_name = "smftools"
16
14
  __version__ = version(package_name)
17
15
 
16
+ if TYPE_CHECKING:
17
+ from smftools import (
18
+ cli,
19
+ config,
20
+ datasets,
21
+ hmm,
22
+ informatics,
23
+ machine_learning,
24
+ plotting,
25
+ preprocessing,
26
+ tools,
27
+ )
28
+
29
+ _LAZY_MODULES = {
30
+ "cli": "smftools.cli",
31
+ "config": "smftools.config",
32
+ "datasets": "smftools.datasets",
33
+ "hmm": "smftools.hmm",
34
+ "inform": "smftools.informatics",
35
+ "ml": "smftools.machine_learning",
36
+ "pl": "smftools.plotting",
37
+ "pp": "smftools.preprocessing",
38
+ "tl": "smftools.tools",
39
+ }
40
+
41
+
42
def __getattr__(name: str):
    """Lazily import a subpackage the first time its alias is accessed.

    This is the module-level ``__getattr__`` hook: it is only consulted when
    normal attribute lookup on the package fails. Aliases listed in
    ``_LAZY_MODULES`` are imported on demand and stored in the module globals.

    Args:
        name: Attribute name requested on the package.

    Returns:
        The imported module registered under ``name``.

    Raises:
        AttributeError: If ``name`` is not a registered lazy alias.
    """
    target = _LAZY_MODULES.get(name)
    if target is None:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    loaded = import_module(target)
    # Cache on the package so subsequent lookups resolve without this hook.
    globals()[name] = loaded
    return loaded
48
+
49
+
18
50
  __all__ = [
19
51
  "adata_to_df",
20
52
  "inform",
smftools/_settings.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  from pathlib import Path
2
4
  from typing import Union
3
5
 
smftools/_version.py CHANGED
@@ -1 +1,3 @@
1
- __version__ = "0.2.5"
1
+ from __future__ import annotations
2
+
3
+ __version__ = "0.3.1"
smftools/cli/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ from __future__ import annotations
smftools/cli/archived/cli_flows.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  def flow_I(config_path):
2
4
  """
3
5
  High-level function to call for converting raw sequencing data to an adata object.
smftools/cli/helpers.py CHANGED
@@ -1,8 +1,12 @@
1
+ from __future__ import annotations
2
+
1
3
  from dataclasses import dataclass
2
4
  from pathlib import Path
3
5
 
4
6
  import anndata as ad
5
7
 
8
+ from smftools.constants import H5_DIR, HMM_DIR, LATENT_DIR, LOAD_DIR, PREPROCESS_DIR, SPATIAL_DIR
9
+
6
10
  from ..metadata import write_runtime_schema_yaml
7
11
  from ..readwrite import safe_write_h5ad
8
12
 
@@ -14,28 +18,35 @@ class AdataPaths:
14
18
  pp_dedup: Path
15
19
  spatial: Path
16
20
  hmm: Path
21
+ latent: Path
17
22
 
18
23
 
19
24
  def get_adata_paths(cfg) -> AdataPaths:
20
25
  """
21
26
  Central helper: given cfg, compute all standard AnnData paths.
22
27
  """
23
- h5_dir = Path(cfg.output_directory) / "h5ads"
28
+ output_directory = Path(cfg.output_directory)
24
29
 
25
- raw = h5_dir / f"{cfg.experiment_name}.h5ad.gz"
30
+ raw = output_directory / LOAD_DIR / H5_DIR / f"{cfg.experiment_name}.h5ad.gz"
26
31
 
27
- pp = h5_dir / f"{cfg.experiment_name}_preprocessed.h5ad.gz"
32
+ pp = output_directory / PREPROCESS_DIR / H5_DIR / f"{cfg.experiment_name}_preprocessed.h5ad.gz"
28
33
 
29
34
  if cfg.smf_modality == "direct":
30
35
  # direct SMF: duplicate-removed path is just preprocessed path
31
36
  pp_dedup = pp
32
37
  else:
33
- pp_dedup = h5_dir / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
38
+ pp_dedup = (
39
+ output_directory
40
+ / PREPROCESS_DIR
41
+ / H5_DIR
42
+ / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
43
+ )
34
44
 
35
45
  pp_dedup_base = pp_dedup.name.removesuffix(".h5ad.gz")
36
46
 
37
- spatial = h5_dir / f"{pp_dedup_base}_spatial.h5ad.gz"
38
- hmm = h5_dir / f"{pp_dedup_base}_spatial_hmm.h5ad.gz"
47
+ spatial = output_directory / SPATIAL_DIR / H5_DIR / f"{pp_dedup_base}_spatial.h5ad.gz"
48
+ hmm = output_directory / HMM_DIR / H5_DIR / f"{pp_dedup_base}_hmm.h5ad.gz"
49
+ latent = output_directory / LATENT_DIR / H5_DIR / f"{pp_dedup_base}_latent.h5ad.gz"
39
50
 
40
51
  return AdataPaths(
41
52
  raw=raw,
@@ -43,7 +54,24 @@ def get_adata_paths(cfg) -> AdataPaths:
43
54
  pp_dedup=pp_dedup,
44
55
  spatial=spatial,
45
56
  hmm=hmm,
57
+ latent=latent,
58
+ )
59
+
60
+
61
def load_experiment_config(config_path: str):
    """Build the experiment config from a config file without running any pipeline stage.

    Args:
        config_path: Path to the experiment configuration file, handed to
            ``LoadExperimentConfig``.

    Returns:
        The config object, i.e. the first element of the pair returned by
        ``ExperimentConfig.from_var_dict``.
    """
    # Imports are kept local so importing this helper module stays lightweight.
    from datetime import datetime
    from importlib import resources

    from ..config import ExperimentConfig, LoadExperimentConfig

    today = datetime.today().strftime("%y%m%d")
    var_dict = LoadExperimentConfig(config_path).var_dict
    # Default YAML files ship inside the installed package.
    packaged_defaults = resources.files("smftools").joinpath("config")

    cfg, _ = ExperimentConfig.from_var_dict(
        var_dict, date_str=today, defaults_dir=packaged_defaults
    )
    return cfg
47
75
 
48
76
 
49
77
  def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
smftools/cli/hmm_adata.py CHANGED
@@ -1,25 +1,152 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import copy
4
+ import logging
4
5
  from dataclasses import dataclass
5
6
  from pathlib import Path
6
- from typing import Any, List, Optional, Sequence, Tuple, Union
7
+ from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union
7
8
 
8
9
  import numpy as np
9
- import torch
10
10
 
11
- from smftools.logging_utils import get_logger
11
+ from smftools.constants import HMM_DIR, LOGGING_DIR
12
+ from smftools.logging_utils import get_logger, setup_logging
13
+ from smftools.optional_imports import require
12
14
 
13
15
  # FIX: import _to_dense_np to avoid NameError
14
16
  from ..hmm.HMM import _safe_int_coords, _to_dense_np, create_hmm, normalize_hmm_feature_sets
15
17
 
16
18
  logger = get_logger(__name__)
17
19
 
20
+ if TYPE_CHECKING:
21
+ import torch as torch_types
22
+
23
+ torch = require("torch", extra="torch", purpose="HMM CLI")
24
+ mpl = require("matplotlib", extra="plotting", purpose="HMM plotting")
25
+ mpl_colors = require("matplotlib.colors", extra="plotting", purpose="HMM plotting")
26
+
18
27
  # =============================================================================
19
28
  # Helpers: extracting training arrays
20
29
  # =============================================================================
21
30
 
22
31
 
32
+ def _strip_hmm_layer_prefix(layer: str) -> str:
33
+ """Strip methbase prefixes and length suffixes from an HMM layer name.
34
+
35
+ Args:
36
+ layer: Full layer name (e.g., "GpC_small_accessible_patch_lengths").
37
+
38
+ Returns:
39
+ The base layer name without methbase prefixes or length suffixes.
40
+ """
41
+ base = layer
42
+ for prefix in ("Combined_", "GpC_", "CpG_", "C_", "A_"):
43
+ if base.startswith(prefix):
44
+ base = base[len(prefix) :]
45
+ break
46
+ if base.endswith("_lengths"):
47
+ base = base[: -len("_lengths")]
48
+ if base.endswith("_merged"):
49
+ base = base[: -len("_merged")]
50
+ return base
51
+
52
+
53
def _resolve_feature_colormap(layer: str, cfg, default_cmap: str) -> Any:
    """Pick the colormap used to draw a given HMM layer.

    The lookup reads ``cfg.hmm_feature_colormaps`` (ignored unless it is a
    dict). An entry keyed by the full layer name takes precedence over one
    keyed by the prefix/suffix-stripped name.

    Args:
        layer: Full layer name.
        cfg: Experiment config.
        default_cmap: Colormap name returned when nothing usable is configured.

    Returns:
        The configured string when it names a registered colormap; a white-to-
        value ``LinearSegmentedColormap`` when the string is not a registered
        colormap; a ``ListedColormap`` for a list/tuple entry; otherwise
        ``default_cmap``.
    """
    configured = getattr(cfg, "hmm_feature_colormaps", None)
    if not isinstance(configured, dict):
        configured = {}

    stem = _strip_hmm_layer_prefix(layer)
    spec = configured[layer] if layer in configured else configured.get(stem)

    if isinstance(spec, str):
        try:
            mpl.colormaps.get_cmap(spec)
        except Exception:
            # Not a registered colormap name: build a ramp from white to it.
            return mpl_colors.LinearSegmentedColormap.from_list(
                f"hmm_{stem}_cmap", ["#ffffff", spec]
            )
        return spec

    if isinstance(spec, (list, tuple)):
        return mpl_colors.ListedColormap(list(spec))

    # Covers a missing entry (None) and any unsupported value type.
    return default_cmap
86
+
87
+
88
def _resolve_feature_color(layer: str, cfg, fallback_cmap: str, idx: int, total: int) -> Any:
    """Pick a single line color for a given HMM layer.

    Args:
        layer: Full layer name.
        cfg: Experiment config; ``hmm_feature_colormaps`` is consulted when it
            is a dict, keyed by the full layer name first and the stripped
            name second.
        fallback_cmap: Colormap name sampled when no usable entry is configured.
        idx: Position of this layer among the layers being colored.
        total: Number of layers being colored.

    Returns:
        For a string entry: the colormap sampled at 0.75 if the string is a
        registered colormap, else the string itself. For a non-empty list/tuple
        entry: its last element. Otherwise ``fallback_cmap`` sampled at
        ``idx / (total - 1)`` (or at 0.5 when ``total <= 1``).
    """
    configured = getattr(cfg, "hmm_feature_colormaps", None)
    if not isinstance(configured, dict):
        configured = {}

    stem = _strip_hmm_layer_prefix(layer)
    spec = configured[layer] if layer in configured else configured.get(stem)

    if isinstance(spec, str):
        try:
            named_cmap = mpl.colormaps.get_cmap(spec)
        except Exception:
            # Not a registered colormap name: hand the string back as the color.
            return spec
        return named_cmap(0.75)

    if isinstance(spec, (list, tuple)) and spec:
        return spec[-1]

    palette = mpl.colormaps.get_cmap(fallback_cmap)
    # A single layer has no spread to interpolate over, so use the midpoint.
    position = 0.5 if total <= 1 else idx / (total - 1)
    return palette(position)
109
+
110
+
111
def _resolve_length_feature_ranges(
    layer: str, cfg, default_cmap: str
) -> List[Tuple[int, int, Any]]:
    """Resolve length-based feature ranges to colors for size contour overlays.

    The stripped layer name selects a feature set from ``cfg.hmm_feature_sets``:
    names containing "accessible" use the "accessible" set, names containing
    "footprint" use the "footprint" set. Each ``[min_len, max_len]`` entry in
    that set's "features" mapping becomes one range.

    Args:
        layer: Full HMM layer name.
        cfg: Experiment config.
        default_cmap: Fallback colormap name forwarded to
            ``_resolve_feature_color``.

    Returns:
        A list of ``(min_len, max_len, color)`` tuples. The list is empty when
        no feature set applies or the configuration is not shaped as expected.
        Entries whose bounds cannot be converted to integers are skipped.
    """
    base = _strip_hmm_layer_prefix(layer)
    feature_sets = getattr(cfg, "hmm_feature_sets", {}) or {}
    if not isinstance(feature_sets, dict):
        return []

    if "accessible" in base:
        feature_key = "accessible"
    elif "footprint" in base:
        feature_key = "footprint"
    else:
        return []

    # A feature set configured as None (or any non-dict) would otherwise raise
    # AttributeError on the chained .get(); treat it as "nothing configured".
    feature_set = feature_sets.get(feature_key)
    if not isinstance(feature_set, dict):
        return []

    features = feature_set.get("features", {})
    if not isinstance(features, dict):
        return []

    ranges: List[Tuple[int, int, Any]] = []
    for feature_name, bounds in features.items():
        if not isinstance(bounds, (list, tuple)) or len(bounds) != 2:
            continue
        min_len, max_len = bounds
        # An open-ended upper bound is replaced by a large finite sentinel.
        if max_len is None or (isinstance(max_len, (float, int)) and np.isinf(max_len)):
            max_len = int(1e9)
        try:
            min_len_int = int(min_len)
            max_len_int = int(max_len)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int() of an infinite float (e.g. min_len = inf).
            continue
        color = _resolve_feature_color(feature_name, cfg, default_cmap, 0, 1)
        ranges.append((min_len_int, max_len_int, color))
    return ranges
148
+
149
+
23
150
  def _get_training_matrix(
24
151
  subset, cols_mask: np.ndarray, smf_modality: Optional[str], cfg
25
152
  ) -> Tuple[np.ndarray, Optional[str]]:
@@ -440,31 +567,25 @@ def hmm_adata(config_path: str):
440
567
  - Call hmm_adata_core(cfg, adata, paths)
441
568
  """
442
569
  from ..readwrite import safe_read_h5ad
443
- from .helpers import get_adata_paths
444
- from .load_adata import load_adata
445
- from .preprocess_adata import preprocess_adata
446
- from .spatial_adata import spatial_adata
570
+ from .helpers import get_adata_paths, load_experiment_config
447
571
 
448
572
  # 1) load cfg / stage paths
449
- _, _, cfg = load_adata(config_path)
450
- paths = get_adata_paths(cfg)
573
+ cfg = load_experiment_config(config_path)
451
574
 
452
- # 2) make sure upstream stages are run (they have their own skipping logic)
453
- preprocess_adata(config_path)
454
- spatial_ad, spatial_path = spatial_adata(config_path)
575
+ paths = get_adata_paths(cfg)
455
576
 
456
- # 3) choose starting AnnData
577
+ # 2) choose starting AnnData
457
578
  # Prefer:
458
579
  # - existing HMM h5ad if not forcing redo
459
580
  # - in-memory spatial_ad from wrapper call
460
581
  # - saved spatial / pp_dedup / pp / raw on disk
461
582
  if paths.hmm.exists() and not (cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply):
462
- adata, _ = safe_read_h5ad(paths.hmm)
463
- return adata, paths.hmm
583
+ logger.debug(f"Skipping hmm. HMM AnnData found: {paths.hmm}")
584
+ return None
464
585
 
465
- if spatial_ad is not None:
466
- adata = spatial_ad
467
- source_path = spatial_path
586
+ if paths.hmm.exists():
587
+ adata, _ = safe_read_h5ad(paths.hmm)
588
+ source_path = paths.hmm
468
589
  elif paths.spatial.exists():
469
590
  adata, _ = safe_read_h5ad(paths.spatial)
470
591
  source_path = paths.spatial
@@ -511,11 +632,14 @@ def hmm_adata_core(
511
632
  Does NOT decide which h5ad to start from – that is the wrapper's job.
512
633
  """
513
634
 
635
+ from datetime import datetime
636
+
514
637
  import numpy as np
515
638
 
516
639
  from ..hmm import call_hmm_peaks
517
640
  from ..metadata import record_smftools_metadata
518
641
  from ..plotting import (
642
+ combined_hmm_length_clustermap,
519
643
  combined_hmm_raw_clustermap,
520
644
  plot_hmm_layers_rolling_by_sample_ref,
521
645
  plot_hmm_size_contours,
@@ -523,18 +647,33 @@ def hmm_adata_core(
523
647
  from ..readwrite import make_dirs
524
648
  from .helpers import write_gz_h5ad
525
649
 
650
+ date_str = datetime.today().strftime("%y%m%d")
651
+ now = datetime.now()
652
+ time_str = now.strftime("%H%M%S")
653
+
654
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
655
+
526
656
  smf_modality = cfg.smf_modality
527
657
  deaminase = smf_modality == "deaminase"
528
658
 
529
659
  output_directory = Path(cfg.output_directory)
530
- make_dirs([output_directory])
660
+ hmm_directory = output_directory / HMM_DIR
661
+ logging_directory = hmm_directory / LOGGING_DIR
662
+
663
+ make_dirs([output_directory, hmm_directory])
664
+
665
+ if cfg.emit_log_file:
666
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
667
+ make_dirs([logging_directory])
668
+ else:
669
+ log_file = None
531
670
 
532
- pp_dir = output_directory / "preprocessed" / "deduplicated"
671
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
533
672
 
534
673
  # ---------------------------- HMM annotate stage ----------------------------
535
674
  if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
536
- hmm_models_dir = pp_dir / "10_hmm_models"
537
- make_dirs([pp_dir, hmm_models_dir])
675
+ hmm_models_dir = hmm_directory / "10_hmm_models"
676
+ make_dirs([hmm_directory, hmm_models_dir])
538
677
 
539
678
  # Standard bookkeeping
540
679
  uns_key = "hmm_appended_layers"
@@ -738,6 +877,8 @@ def hmm_adata_core(
738
877
  uns_key=uns_key,
739
878
  uns_flag="hmm_annotated_combined",
740
879
  force_redo=force_apply,
880
+ mask_to_read_span=True,
881
+ mask_use_original_var_names=True,
741
882
  )
742
883
 
743
884
  for core_layer, dist in (
@@ -850,11 +991,11 @@ def hmm_adata_core(
850
991
  logger.info(f"HMM appended layers: {hmm_layers}")
851
992
 
852
993
  # ---------------------------- HMM peak calling stage ----------------------------
853
- hmm_dir = pp_dir / "11_hmm_peak_calling"
994
+ hmm_dir = hmm_directory / "11_hmm_peak_calling"
854
995
  if hmm_dir.is_dir():
855
996
  pass
856
997
  else:
857
- make_dirs([pp_dir, hmm_dir])
998
+ make_dirs([hmm_directory, hmm_dir])
858
999
 
859
1000
  call_hmm_peaks(
860
1001
  adata,
@@ -883,8 +1024,8 @@ def hmm_adata_core(
883
1024
 
884
1025
  ############################################### HMM based feature plotting ###############################################
885
1026
 
886
- hmm_dir = pp_dir / "12_hmm_clustermaps"
887
- make_dirs([pp_dir, hmm_dir])
1027
+ hmm_dir = hmm_directory / "12_hmm_clustermaps"
1028
+ make_dirs([hmm_directory, hmm_dir])
888
1029
 
889
1030
  layers: list[str] = []
890
1031
 
@@ -909,6 +1050,7 @@ def hmm_adata_core(
909
1050
  pass
910
1051
  else:
911
1052
  make_dirs([hmm_cluster_save_dir])
1053
+ hmm_cmap = _resolve_feature_colormap(layer, cfg, cfg.clustermap_cmap_hmm)
912
1054
 
913
1055
  combined_hmm_raw_clustermap(
914
1056
  adata,
@@ -919,7 +1061,7 @@ def hmm_adata_core(
919
1061
  layer_cpg=cfg.layer_for_clustermap_plotting,
920
1062
  layer_c=cfg.layer_for_clustermap_plotting,
921
1063
  layer_a=cfg.layer_for_clustermap_plotting,
922
- cmap_hmm=cfg.clustermap_cmap_hmm,
1064
+ cmap_hmm=hmm_cmap,
923
1065
  cmap_gpc=cfg.clustermap_cmap_gpc,
924
1066
  cmap_cpg=cfg.clustermap_cmap_cpg,
925
1067
  cmap_c=cfg.clustermap_cmap_c,
@@ -930,7 +1072,7 @@ def hmm_adata_core(
930
1072
  0
931
1073
  ],
932
1074
  min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
933
- demux_types=("double", "already"),
1075
+ demux_types=cfg.clustermap_demux_types_to_plot,
934
1076
  save_path=hmm_cluster_save_dir,
935
1077
  normalize_hmm=False,
936
1078
  sort_by=cfg.hmm_clustermap_sortby, # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
@@ -940,12 +1082,68 @@ def hmm_adata_core(
940
1082
  index_col_suffix=cfg.reindexed_var_suffix,
941
1083
  )
942
1084
 
943
- hmm_dir = pp_dir / "13_hmm_bulk_traces"
1085
+ hmm_length_dir = hmm_directory / "12b_hmm_length_clustermaps"
1086
+ make_dirs([hmm_directory, hmm_length_dir])
1087
+
1088
+ length_layers: list[str] = []
1089
+ length_layer_roots = list(
1090
+ getattr(cfg, "hmm_clustermap_length_layers", cfg.hmm_clustermap_feature_layers)
1091
+ )
1092
+
1093
+ for base in cfg.hmm_methbases:
1094
+ length_layers.extend([f"{base}_{layer}_lengths" for layer in length_layer_roots])
1095
+
1096
+ if getattr(cfg, "hmm_run_multichannel", True) and len(cfg.hmm_methbases) >= 2:
1097
+ length_layers.extend([f"Combined_{layer}_lengths" for layer in length_layer_roots])
1098
+
1099
+ if cfg.cpg:
1100
+ length_layers.extend(["CpG_cpg_patch_lengths"])
1101
+
1102
+ for layer in length_layers:
1103
+ hmm_cluster_save_dir = hmm_length_dir / layer
1104
+ if hmm_cluster_save_dir.is_dir():
1105
+ pass
1106
+ else:
1107
+ make_dirs([hmm_cluster_save_dir])
1108
+ length_cmap = _resolve_feature_colormap(layer, cfg, "Greens")
1109
+ length_feature_ranges = _resolve_length_feature_ranges(layer, cfg, "Greens")
1110
+
1111
+ combined_hmm_length_clustermap(
1112
+ adata,
1113
+ sample_col=cfg.sample_name_col_for_plotting,
1114
+ reference_col=cfg.reference_column,
1115
+ length_layer=layer,
1116
+ layer_gpc=cfg.layer_for_clustermap_plotting,
1117
+ layer_cpg=cfg.layer_for_clustermap_plotting,
1118
+ layer_c=cfg.layer_for_clustermap_plotting,
1119
+ layer_a=cfg.layer_for_clustermap_plotting,
1120
+ cmap_lengths=length_cmap,
1121
+ cmap_gpc=cfg.clustermap_cmap_gpc,
1122
+ cmap_cpg=cfg.clustermap_cmap_cpg,
1123
+ cmap_c=cfg.clustermap_cmap_c,
1124
+ cmap_a=cfg.clustermap_cmap_a,
1125
+ min_quality=cfg.read_quality_filter_thresholds[0],
1126
+ min_length=cfg.read_len_filter_thresholds[0],
1127
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[
1128
+ 0
1129
+ ],
1130
+ min_position_valid_fraction=1 - cfg.position_max_nan_threshold,
1131
+ demux_types=cfg.clustermap_demux_types_to_plot,
1132
+ save_path=hmm_cluster_save_dir,
1133
+ sort_by=cfg.hmm_clustermap_sortby,
1134
+ bins=None,
1135
+ deaminase=deaminase,
1136
+ min_signal=0,
1137
+ index_col_suffix=cfg.reindexed_var_suffix,
1138
+ length_feature_ranges=length_feature_ranges,
1139
+ )
1140
+
1141
+ hmm_dir = hmm_directory / "13_hmm_bulk_traces"
944
1142
 
945
1143
  if hmm_dir.is_dir():
946
1144
  logger.debug(f"{hmm_dir} already exists.")
947
1145
  else:
948
- make_dirs([pp_dir, hmm_dir])
1146
+ make_dirs([hmm_directory, hmm_dir])
949
1147
  from ..plotting import plot_hmm_layers_rolling_by_sample_ref
950
1148
 
951
1149
  bulk_hmm_layers = [
@@ -953,6 +1151,10 @@ def hmm_adata_core(
953
1151
  for layer in hmm_layers
954
1152
  if not any(s in layer for s in ("_lengths", "_states", "_posterior"))
955
1153
  ]
1154
+ layer_colors = {
1155
+ layer: _resolve_feature_color(layer, cfg, "tab20", idx, len(bulk_hmm_layers))
1156
+ for idx, layer in enumerate(bulk_hmm_layers)
1157
+ }
956
1158
  saved = plot_hmm_layers_rolling_by_sample_ref(
957
1159
  adata,
958
1160
  layers=bulk_hmm_layers,
@@ -964,14 +1166,15 @@ def hmm_adata_core(
964
1166
  output_dir=hmm_dir,
965
1167
  save=True,
966
1168
  show_raw=False,
1169
+ layer_colors=layer_colors,
967
1170
  )
968
1171
 
969
- hmm_dir = pp_dir / "14_hmm_fragment_distributions"
1172
+ hmm_dir = hmm_directory / "14_hmm_fragment_distributions"
970
1173
 
971
1174
  if hmm_dir.is_dir():
972
1175
  logger.debug(f"{hmm_dir} already exists.")
973
1176
  else:
974
- make_dirs([pp_dir, hmm_dir])
1177
+ make_dirs([hmm_directory, hmm_dir])
975
1178
  from ..plotting import plot_hmm_size_contours
976
1179
 
977
1180
  if smf_modality == "deaminase":
@@ -996,6 +1199,8 @@ def hmm_adata_core(
996
1199
  for layer, max in fragments:
997
1200
  save_path = hmm_dir / layer
998
1201
  make_dirs([save_path])
1202
+ layer_cmap = _resolve_feature_colormap(layer, cfg, "Greens")
1203
+ feature_ranges = _resolve_length_feature_ranges(layer, cfg, "Greens")
999
1204
 
1000
1205
  figs = plot_hmm_size_contours(
1001
1206
  adata,
@@ -1011,8 +1216,9 @@ def hmm_adata_core(
1011
1216
  dpi=200,
1012
1217
  smoothing_sigma=(10, 10),
1013
1218
  normalize_after_smoothing=True,
1014
- cmap="Greens",
1219
+ cmap=layer_cmap,
1015
1220
  log_scale_z=True,
1221
+ feature_ranges=tuple(feature_ranges),
1016
1222
  )
1017
1223
  ########################################################################################################################
1018
1224