smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -6,6 +6,7 @@ import warnings
6
6
  from dataclasses import dataclass, field, asdict
7
7
  from pathlib import Path
8
8
  from typing import Any, Dict, List, Optional, Tuple, Union, IO, Sequence
9
+ from .discover_input_files import discover_input_files
9
10
 
10
11
  # Optional dependency for YAML handling
11
12
  try:
@@ -213,7 +214,7 @@ def resolve_aligner_args(
213
214
  return list(default_by_aligner.get(key_align, []))
214
215
 
215
216
 
216
- # HMM default params and hepler functions
217
+ # HMM default params and helper functions
217
218
  def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
218
219
  """
219
220
  Normalize user-provided `hmm_feature_sets` into canonical structure:
@@ -274,6 +275,58 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
274
275
  canonical[grp] = {"features": feats, "state": state}
275
276
  return canonical
276
277
 
278
+ def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
279
+ """
280
+ Normalize user-provided `hmm_peak_feature_configs` into:
281
+ {
282
+ layer_name: {
283
+ "min_distance": int,
284
+ "peak_width": int,
285
+ "peak_prominence": float,
286
+ "peak_threshold": float,
287
+ "rolling_window": int,
288
+ },
289
+ ...
290
+ }
291
+
292
+ Accepts dict, JSON/string, None. Returns {} for empty input.
293
+ """
294
+ if raw is None:
295
+ return {}
296
+
297
+ parsed = raw
298
+ if isinstance(raw, str):
299
+ parsed = _try_json_or_literal(raw)
300
+ if not isinstance(parsed, dict):
301
+ return {}
302
+
303
+ defaults = {
304
+ "min_distance": 200,
305
+ "peak_width": 200,
306
+ "peak_prominence": 0.2,
307
+ "peak_threshold": 0.8,
308
+ "rolling_window": 1,
309
+ }
310
+
311
+ out: Dict[str, dict] = {}
312
+ for layer, conf in parsed.items():
313
+ if conf is None:
314
+ conf = {}
315
+ if not isinstance(conf, dict):
316
+ # allow shorthand like 300 -> interpreted as peak_width
317
+ conf = {"peak_width": conf}
318
+
319
+ full = defaults.copy()
320
+ full.update(conf)
321
+ out[str(layer)] = {
322
+ "min_distance": int(full["min_distance"]),
323
+ "peak_width": int(full["peak_width"]),
324
+ "peak_prominence": float(full["peak_prominence"]),
325
+ "peak_threshold": float(full["peak_threshold"]),
326
+ "rolling_window": int(full["rolling_window"]),
327
+ }
328
+ return out
329
+
277
330
 
278
331
  # -------------------------
279
332
  # LoadExperimentConfig
@@ -593,7 +646,10 @@ class ExperimentConfig:
593
646
  fasta: Optional[str] = None
594
647
  bam_suffix: str = ".bam"
595
648
  recursive_input_search: bool = True
649
+ input_type: Optional[str] = None
650
+ input_files: Optional[List[Path]] = None
596
651
  split_dir: str = "demultiplexed_BAMs"
652
+ split_path: Optional[str] = None
597
653
  strands: List[str] = field(default_factory=lambda: ["bottom", "top"])
598
654
  conversions: List[str] = field(default_factory=lambda: ["unconverted"])
599
655
  fasta_regions_of_interest: Optional[str] = None
@@ -601,11 +657,16 @@ class ExperimentConfig:
601
657
  sample_sheet_mapping_column: Optional[str] = 'Barcode'
602
658
  experiment_name: Optional[str] = None
603
659
  input_already_demuxed: bool = False
660
+ summary_file: Optional[Path] = None
604
661
 
605
662
  # FASTQ input specific
606
663
  fastq_barcode_map: Optional[Dict[str, str]] = None
607
664
  fastq_auto_pairing: bool = True
608
665
 
666
+ # Remove intermediate file options
667
+ delete_intermediate_bams: bool = False
668
+ delete_intermediate_tsvs: bool = True
669
+
609
670
  # Conversion/Deamination file handling
610
671
  delete_intermediate_hdfs: bool = True
611
672
 
@@ -638,13 +699,16 @@ class ExperimentConfig:
638
699
  m5C_threshold: float = 0.7
639
700
  hm5C_threshold: float = 0.7
640
701
  thresholds: List[float] = field(default_factory=list)
641
- mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
702
+ mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"]) # Dorado modified basecalling codes
703
+ mod_map: Dict[str, str] = field(default_factory=lambda: {'6mA': '6mA', '5mC_5hmC': '5mC'}) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
642
704
 
643
705
  # Alignment params
644
706
  mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
645
- aligner: str = "minimap2"
707
+ align_from_bam: bool = False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
708
+ aligner: str = "dorado"
646
709
  aligner_args: Optional[List[str]] = None
647
710
  make_bigwigs: bool = False
711
+ make_beds: bool = False
648
712
 
649
713
  # Anndata structure
650
714
  reference_column: Optional[str] = 'Reference_strand'
@@ -656,23 +720,40 @@ class ExperimentConfig:
656
720
 
657
721
  # Preprocessing - Read length and quality filter params
658
722
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
659
- read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [200, None])
660
- read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.1])
661
- read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [20, None])
723
+ read_len_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [100, None])
724
+ read_len_to_ref_ratio_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [0.4, 1.5])
725
+ read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
662
726
  read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
663
727
 
728
+ # Preprocessing - Optional reindexing params
729
+ reindexing_offsets: Dict[str, int] = field(default_factory=dict)
730
+ reindexed_var_suffix: Optional[str] = "reindexed"
731
+
732
+ # Preprocessing - Direct mod detection binarization params
733
+ fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
734
+ binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
735
+ positive_control_sample_methylation_fitting: Optional[str] = None # A positive control Sample_name to use for fully modified template data
736
+ negative_control_sample_methylation_fitting: Optional[str] = None # A negative control Sample_name to use for fully unmodified template data
737
+ infer_on_percentile_sample_methylation_fitting: Optional[int] = 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
738
+ inference_variable_sample_methylation_fitting: Optional[str] = "Raw_modification_signal" # The obs column value used for the percentile metric above.
739
+ fit_j_threshold: Optional[float] = 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
740
+ output_binary_layer_name: Optional[str] = "binarized_methylation"
741
+
664
742
  # Preprocessing - Read modification filter params
665
743
  read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
666
744
  read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
667
- read_mod_filtering_any_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
745
+ read_mod_filtering_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
668
746
  read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
669
747
  read_mod_filtering_use_other_c_as_background: bool = True
670
748
  min_valid_fraction_positions_in_read_vs_ref: float = 0.2
671
749
 
750
+ # Preprocessing - plotting params
751
+ obs_to_plot_pp_qc: List[str] = field(default_factory=lambda: ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal'])
752
+
672
753
  # Preprocessing - Duplicate detection params
673
754
  duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
674
755
  duplicate_detection_distance_threshold: float = 0.07
675
- hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_any_C_site_modified'])
756
+ hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_C_site_modified'])
676
757
  duplicate_detection_keep_best_metric: str ='read_quality'
677
758
  duplicate_detection_window_size_for_hamming_neighbors: int = 50
678
759
  duplicate_detection_min_overlapping_positions: int = 20
@@ -680,22 +761,28 @@ class ExperimentConfig:
680
761
  duplicate_detection_hierarchical_linkage: str = "average"
681
762
  duplicate_detection_do_pca: bool = False
682
763
 
683
- # Preprocessing - Complexity analysis params
764
+ # Preprocessing - Position QC
765
+ position_max_nan_threshold: float = 0.1
684
766
 
685
- # Basic Analysis - Clustermap params
767
+ # Spatial Analysis - Clustermap params
686
768
  layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
769
+ clustermap_cmap_c: Optional[str] = 'coolwarm'
770
+ clustermap_cmap_gpc: Optional[str] = 'coolwarm'
771
+ clustermap_cmap_cpg: Optional[str] = 'coolwarm'
772
+ clustermap_cmap_a: Optional[str] = 'coolwarm'
773
+ spatial_clustermap_sortby: Optional[str] = 'gpc'
687
774
 
688
- # Basic Analysis - UMAP/Leiden params
775
+ # Spatial Analysis - UMAP/Leiden params
689
776
  layer_for_umap_plotting: Optional[str] = 'nan_half'
690
777
  umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
691
778
 
692
- # Basic Analysis - Spatial Autocorrelation params
779
+ # Spatial Analysis - Spatial Autocorrelation params
693
780
  rows_per_qc_autocorr_grid: int = 12
694
781
  autocorr_rolling_window_size: int = 25
695
782
  autocorr_max_lag: int = 800
696
- autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'any_C'])
783
+ autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'C'])
697
784
 
698
- # Basic Analysis - Correlation Matrix params
785
+ # Spatial Analysis - Correlation Matrix params
699
786
  correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
700
787
  correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
701
788
  correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
@@ -717,6 +804,13 @@ class ExperimentConfig:
717
804
  cpg: Optional[bool] = False
718
805
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
719
806
  hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
807
+ clustermap_cmap_hmm: Optional[str] = 'coolwarm'
808
+ hmm_clustermap_feature_layers: List[str] = field(default_factory=lambda: ["all_accessible_features"])
809
+ hmm_clustermap_sortby: Optional[str] = 'hmm'
810
+ hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
811
+
812
+ # Pipeline control flow - load adata
813
+ force_redo_load_adata: bool = False
720
814
 
721
815
  # Pipeline control flow - preprocessing and QC
722
816
  force_redo_preprocessing: bool = False
@@ -739,8 +833,8 @@ class ExperimentConfig:
739
833
  bypass_complexity_analysis: bool = False
740
834
  force_redo_complexity_analysis: bool = False
741
835
 
742
- # Pipeline control flow - Basic Analyses
743
- force_redo_basic_analyses: bool = False
836
+ # Pipeline control flow - Spatial Analyses
837
+ force_redo_spatial_analyses: bool = False
744
838
  bypass_basic_clustermaps: bool = False
745
839
  force_redo_basic_clustermaps: bool = False
746
840
  bypass_basic_umap: bool = False
@@ -860,6 +954,70 @@ class ExperimentConfig:
860
954
  if merged.get("experiment_name") is None and date_str:
861
955
  merged["experiment_name"] = f"{date_str}_SMF_experiment"
862
956
 
957
+ # Input file types and path handling
958
+ input_data_path = Path(merged['input_data_path'])
959
+
960
+ # Detect the input filetype
961
+ if input_data_path.is_file():
962
+ suffix = input_data_path.suffix.lower()
963
+ suffixes = [s.lower() for s in input_data_path.suffixes] # handles multi-part extensions
964
+
965
+ # recognize multi-suffix cases like .fastq.gz or .fq.gz
966
+ if any(s in ['.pod5', '.p5'] for s in suffixes):
967
+ input_type = "pod5"
968
+ input_files = [Path(input_data_path)]
969
+ elif any(s in ['.fast5', '.f5'] for s in suffixes):
970
+ input_type = "fast5"
971
+ input_files = [Path(input_data_path)]
972
+ elif any(s in ['.fastq', '.fq'] for s in suffixes):
973
+ input_type = "fastq"
974
+ input_files = [Path(input_data_path)]
975
+ elif any(s in ['.bam'] for s in suffixes):
976
+ input_type = "bam"
977
+ input_files = [Path(input_data_path)]
978
+ elif any(s in ['.h5ad', ".h5"] for s in suffixes):
979
+ input_type = "h5ad"
980
+ input_files = [Path(input_data_path)]
981
+ else:
982
+ print("Error detecting input file type")
983
+
984
+ elif input_data_path.is_dir():
985
+ found = discover_input_files(input_data_path, bam_suffix=merged["bam_suffix"], recursive=merged["recursive_input_search"])
986
+
987
+ if found["input_is_pod5"]:
988
+ input_type = "pod5"
989
+ input_files = found["pod5_paths"]
990
+ elif found["input_is_fast5"]:
991
+ input_type = "fast5"
992
+ input_files = found["fast5_paths"]
993
+ elif found["input_is_fastq"]:
994
+ input_type = "fastq"
995
+ input_files = found["fastq_paths"]
996
+ elif found["input_is_bam"]:
997
+ input_type = "bam"
998
+ input_files = found["bam_paths"]
999
+ elif found["input_is_h5ad"]:
1000
+ input_type = "h5ad"
1001
+ input_files = found["h5ad_paths"]
1002
+
1003
+ print(
1004
+ f"Found {found['all_files_searched']} files; "
1005
+ f"fastq={len(found['fastq_paths'])}, "
1006
+ f"bam={len(found['bam_paths'])}, "
1007
+ f"pod5={len(found['pod5_paths'])}, "
1008
+ f"fast5={len(found['fast5_paths'])}, "
1009
+ f"h5ad={len(found['h5ad_paths'])}"
1010
+ )
1011
+
1012
+ # summary file output path
1013
+ output_dir = Path(merged['output_directory'])
1014
+ summary_file_basename = merged["experiment_name"] + '_output_summary.csv'
1015
+ summary_file = output_dir / summary_file_basename
1016
+
1017
+ # Demultiplexing output path
1018
+ split_dir = merged.get("split_dir", "demultiplexed_BAMs")
1019
+ split_path = output_dir / split_dir
1020
+
863
1021
  # final normalization
864
1022
  if "strands" in merged:
865
1023
  merged["strands"] = _parse_list(merged["strands"])
@@ -900,6 +1058,9 @@ class ExperimentConfig:
900
1058
  if "mod_list" in merged:
901
1059
  merged["mod_list"] = _parse_list(merged.get("mod_list"))
902
1060
 
1061
+ # Preprocessing args
1062
+ obs_to_plot_pp_qc = _parse_list(merged.get("obs_to_plot_pp_qc", None))
1063
+
903
1064
  # HMM feature set handling
904
1065
  if "hmm_feature_sets" in merged:
905
1066
  merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
@@ -935,14 +1096,23 @@ class ExperimentConfig:
935
1096
  hmm_methbases = ['C']
936
1097
  hmm_methbases = list(hmm_methbases)
937
1098
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
1099
+ hmm_clustermap_feature_layers = _parse_list(merged.get("hmm_clustermap_feature_layers", "all_accessible_features"))
938
1100
 
1101
+ # HMM peak feature configs (for call_hmm_peaks)
1102
+ merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
1103
+ merged.get("hmm_peak_feature_configs", {})
1104
+ )
1105
+ hmm_peak_feature_configs = merged.get("hmm_peak_feature_configs", {})
939
1106
 
940
1107
  # instantiate dataclass
941
1108
  instance = cls(
942
1109
  smf_modality = merged.get("smf_modality"),
943
- input_data_path = merged.get("input_data_path"),
1110
+ input_data_path = input_data_path,
944
1111
  recursive_input_search = merged.get("recursive_input_search"),
945
- output_directory = merged.get("output_directory"),
1112
+ input_type = input_type,
1113
+ input_files = input_files,
1114
+ output_directory = output_dir,
1115
+ summary_file = summary_file,
946
1116
  fasta = merged.get("fasta"),
947
1117
  sequencer = merged.get("sequencer"),
948
1118
  model_dir = merged.get("model_dir"),
@@ -950,7 +1120,8 @@ class ExperimentConfig:
950
1120
  fastq_barcode_map = merged.get("fastq_barcode_map"),
951
1121
  fastq_auto_pairing = merged.get("fastq_auto_pairing"),
952
1122
  bam_suffix = merged.get("bam_suffix", ".bam"),
953
- split_dir = merged.get("split_dir", "demultiplexed_BAMs"),
1123
+ split_dir = split_dir,
1124
+ split_path = split_path,
954
1125
  strands = merged.get("strands", ["bottom","top"]),
955
1126
  conversions = merged.get("conversions", ["unconverted"]),
956
1127
  fasta_regions_of_interest = merged.get("fasta_regions_of_interest"),
@@ -963,14 +1134,18 @@ class ExperimentConfig:
963
1134
  threads = merged.get("threads"),
964
1135
  sample_sheet_path = merged.get("sample_sheet_path"),
965
1136
  sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1137
+ delete_intermediate_bams = merged.get("delete_intermediate_bams", False),
1138
+ delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
1139
+ align_from_bam = merged.get("align_from_bam", False),
966
1140
  aligner = merged.get("aligner", "minimap2"),
967
1141
  aligner_args = merged.get("aligner_args", None),
968
1142
  device = merged.get("device", "auto"),
969
1143
  make_bigwigs = merged.get("make_bigwigs", False),
1144
+ make_beds = merged.get("make_beds", False),
970
1145
  delete_intermediate_hdfs = merged.get("delete_intermediate_hdfs", True),
971
1146
  mod_target_bases = merged.get("mod_target_bases", ["GpC","CpG"]),
972
1147
  enzyme_target_bases = merged.get("enzyme_target_bases", ["GpC"]),
973
- conversion_types = merged.get("conversion_types", ["5mC"]),
1148
+ conversion_types = merged.get("conversions", ["unconverted"]) + merged.get("conversion_types", ["5mC"]),
974
1149
  filter_threshold = merged.get("filter_threshold", 0.8),
975
1150
  m6A_threshold = merged.get("m6A_threshold", 0.7),
976
1151
  m5C_threshold = merged.get("m5C_threshold", 0.7),
@@ -983,14 +1158,30 @@ class ExperimentConfig:
983
1158
  reference_column = merged.get("reference_column", 'Reference_strand'),
984
1159
  sample_column = merged.get("sample_column", 'Barcode'),
985
1160
  sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1161
+ obs_to_plot_pp_qc = obs_to_plot_pp_qc,
1162
+ fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1163
+ binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1164
+ positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
1165
+ negative_control_sample_methylation_fitting = merged.get("negative_control_sample_methylation_fitting", None),
1166
+ infer_on_percentile_sample_methylation_fitting = merged.get("infer_on_percentile_sample_methylation_fitting", 10),
1167
+ inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1168
+ fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1169
+ output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
1170
+ reindexing_offsets = merged.get("reindexing_offsets", {None: None}),
1171
+ reindexed_var_suffix = merged.get("reindexed_var_suffix", "reindexed"),
986
1172
  layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
1173
+ clustermap_cmap_c = merged.get("clustermap_cmap_c", 'coolwarm'),
1174
+ clustermap_cmap_gpc = merged.get("clustermap_cmap_gpc", 'coolwarm'),
1175
+ clustermap_cmap_cpg = merged.get("clustermap_cmap_cpg", 'coolwarm'),
1176
+ clustermap_cmap_a = merged.get("clustermap_cmap_a", 'coolwarm'),
1177
+ spatial_clustermap_sortby = merged.get("spatial_clustermap_sortby", 'gpc'),
987
1178
  layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
988
1179
  umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
989
1180
  rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
990
1181
  rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
991
1182
  autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
992
1183
  autocorr_max_lag = merged.get("autocorr_max_lag", 800),
993
- autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'any_C']),
1184
+ autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'C']),
994
1185
  hmm_n_states = merged.get("hmm_n_states", 2),
995
1186
  hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
996
1187
  hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
@@ -1004,17 +1195,21 @@ class ExperimentConfig:
1004
1195
  hmm_methbases = hmm_methbases,
1005
1196
  hmm_device = hmm_device,
1006
1197
  hmm_merge_layer_features = hmm_merge_layer_features,
1198
+ clustermap_cmap_hmm = merged.get("clustermap_cmap_hmm", 'coolwarm'),
1199
+ hmm_clustermap_feature_layers = hmm_clustermap_feature_layers,
1200
+ hmm_clustermap_sortby = merged.get("hmm_clustermap_sortby", 'hmm'),
1201
+ hmm_peak_feature_configs = hmm_peak_feature_configs,
1007
1202
  footprints = merged.get("footprints", None),
1008
1203
  accessible_patches = merged.get("accessible_patches", None),
1009
1204
  cpg = merged.get("cpg", None),
1010
1205
  read_coord_filter = merged.get("read_coord_filter", [None, None]),
1011
- read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [200, None]),
1012
- read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.4, 1.1]),
1013
- read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [20, None]),
1206
+ read_len_filter_thresholds = merged.get("read_len_filter_thresholds", [100, None]),
1207
+ read_len_to_ref_ratio_filter_thresholds = merged.get("read_len_to_ref_ratio_filter_thresholds", [0.3, None]),
1208
+ read_quality_filter_thresholds = merged.get("read_quality_filter_thresholds", [15, None]),
1014
1209
  read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1015
1210
  read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1016
1211
  read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
1017
- read_mod_filtering_any_c_thresholds = merged.get("read_mod_filtering_any_c_thresholds", [0.025, 0.975]),
1212
+ read_mod_filtering_c_thresholds = merged.get("read_mod_filtering_c_thresholds", [0.025, 0.975]),
1018
1213
  read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
1019
1214
  read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
1020
1215
  min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
@@ -1026,10 +1221,12 @@ class ExperimentConfig:
1026
1221
  duplicate_detection_do_hierarchical = merged.get("duplicate_detection_do_hierarchical", True),
1027
1222
  duplicate_detection_hierarchical_linkage = merged.get("duplicate_detection_hierarchical_linkage", "average"),
1028
1223
  duplicate_detection_do_pca = merged.get("duplicate_detection_do_pca", False),
1224
+ position_max_nan_threshold = merged.get("position_max_nan_threshold", 0.1),
1029
1225
  correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1030
1226
  correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1031
1227
  correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1032
- hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
1228
+ hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_C_site_modified']),
1229
+ force_redo_load_adata = merged.get("force_redo_load_adata", False),
1033
1230
  force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1034
1231
  force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
1035
1232
  bypass_add_read_length_and_mapping_qc = merged.get("bypass_add_read_length_and_mapping_qc", False),
@@ -1049,7 +1246,7 @@ class ExperimentConfig:
1049
1246
  force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
1050
1247
  bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
1051
1248
  force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
1052
- force_redo_basic_analyses = merged.get("force_redo_basic_analyses", False),
1249
+ force_redo_spatial_analyses = merged.get("force_redo_spatial_analyses", False),
1053
1250
  bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
1054
1251
  force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
1055
1252
  bypass_basic_umap = merged.get("bypass_basic_umap", False),
@@ -1101,6 +1298,7 @@ class ExperimentConfig:
1101
1298
  # -------------------------
1102
1299
  # validation & serialization
1103
1300
  # -------------------------
1301
+ @staticmethod
1104
1302
  def _validate_hmm_features_structure(hfs: dict) -> List[str]:
1105
1303
  errs = []
1106
1304
  if not isinstance(hfs, dict):
smftools/hmm/HMM.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import math
2
- from typing import List, Optional, Tuple, Union, Any, Dict
2
+ from typing import List, Optional, Tuple, Union, Any, Dict, Sequence
3
3
  import ast
4
4
  import json
5
5
 
@@ -772,6 +772,8 @@ class HMM(nn.Module):
772
772
  verbose: bool = True,
773
773
  uns_key: str = "hmm_appended_layers",
774
774
  config: Optional[Union[dict, "ExperimentConfig"]] = None, # NEW: config/dict accepted
775
+ uns_flag: str = "hmm_annotated",
776
+ force_redo: bool = False
775
777
  ):
776
778
  """
777
779
  Annotate an AnnData with HMM-derived features (in adata.obs and adata.layers).
@@ -793,6 +795,12 @@ class HMM(nn.Module):
793
795
  import torch as _torch
794
796
  from tqdm import trange, tqdm as _tqdm
795
797
 
798
+ # Only run if not already performed
799
+ already = bool(adata.uns.get(uns_flag, False))
800
+ if (already and not force_redo):
801
+ # QC already performed; nothing to do
802
+ return None if in_place else adata
803
+
796
804
  # small helpers
797
805
  def _try_json_or_literal(s):
798
806
  if s is None:
@@ -1298,6 +1306,9 @@ class HMM(nn.Module):
1298
1306
  new_list = existing + [l for l in appended_layers if l not in existing]
1299
1307
  adata.uns[uns_key] = new_list
1300
1308
 
1309
+ # Mark that the annotation has been completed
1310
+ adata.uns[uns_flag] = True
1311
+
1301
1312
  return None if in_place else adata
1302
1313
 
1303
1314
  def merge_intervals_in_layer(
smftools/hmm/__init__.py CHANGED
@@ -1,20 +1,14 @@
1
- from .apply_hmm_batched import apply_hmm_batched
2
- from .calculate_distances import calculate_distances
3
1
  from .call_hmm_peaks import call_hmm_peaks
4
2
  from .display_hmm import display_hmm
5
3
  from .hmm_readwrite import load_hmm, save_hmm
6
4
  from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
7
- from .train_hmm import train_hmm
8
5
 
9
6
 
10
7
  __all__ = [
11
- "apply_hmm_batched",
12
- "calculate_distances",
13
8
  "call_hmm_peaks",
14
9
  "display_hmm",
15
10
  "load_hmm",
16
11
  "refine_nucleosome_calls",
17
12
  "infer_nucleosomes_in_large_bound",
18
13
  "save_hmm",
19
- "train_hmm"
20
14
  ]
@@ -0,0 +1,106 @@
1
def call_hmm_peaks(
    adata,
    feature_configs,
    obs_column='Reference_strand',
    site_types=['GpC_site', 'CpG_site'],
    save_plot=False,
    output_dir=None,
    date_tag=None,
    inplace=False
):
    """
    Call peaks on the per-position mean of HMM-derived layers and annotate an AnnData.

    For each layer named in ``feature_configs``, peaks are detected on the
    column-wise mean signal with ``scipy.signal.find_peaks``. A fixed-width
    window around each peak center is then used to:

    * store boolean membership masks in ``adata.var`` (one column per peak,
      plus ``is_in_any_<layer>_peak`` and a global ``is_in_any_peak``),
    * add per-read mean/sum/presence summaries of the layer to ``adata.obs``,
    * add per-read sum/mean of raw ``adata.X`` restricted to each requested
      site type (via per-reference masks named ``f"{ref}_{site_type}"`` in
      ``adata.var``) around each peak.

    Parameters
    ----------
    adata : AnnData
        Annotated matrix; ``adata.var_names`` must be castable to integer
        genomic coordinates, and each layer in ``feature_configs`` must exist
        in ``adata.layers``.
    feature_configs : dict
        Maps layer name -> config dict with optional keys ``min_distance``
        (default 200), ``peak_width`` (200), ``peak_prominence`` (0.2) and
        ``peak_threshold`` (0.8).
    obs_column : str
        Obs column holding reference/strand labels; converted to categorical
        if needed. NOTE(review): assumed to exist in ``adata.obs``.
    site_types : sequence of str
        Site-type suffixes (e.g. ``'GpC_site'``) to summarize around peaks.
        Read-only; the mutable default is never mutated.
    save_plot : bool
        If True (and ``output_dir`` is set), save each peak plot to disk
        instead of calling ``plt.show()``.
    output_dir : str or None
        Directory for saved plots.
    date_tag : str or None
        Optional filename prefix for saved plots (defaults to ``'output'``).
    inplace : bool
        If True, modify ``adata`` in place and return None; otherwise work on
        and return a copy.

    Returns
    -------
    AnnData or None
        The annotated copy when ``inplace`` is False, otherwise None.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.signal import find_peaks

    if not inplace:
        adata = adata.copy()

    # Ensure obs_column is categorical so .cat.categories is available below.
    if not isinstance(adata.obs[obs_column].dtype, pd.CategoricalDtype):
        adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column])

    coordinates = adata.var_names.astype(int).values
    peak_columns = []

    # Collect new obs columns and concat once at the end to avoid repeatedly
    # inserting (and fragmenting) adata.obs.
    obs_updates = {}

    for feature_layer, config in feature_configs.items():
        min_distance = config.get('min_distance', 200)
        peak_width = config.get('peak_width', 200)
        peak_prominence = config.get('peak_prominence', 0.2)
        peak_threshold = config.get('peak_threshold', 0.8)

        matrix = adata.layers[feature_layer]
        means = np.mean(matrix, axis=0)
        peak_indices, _ = find_peaks(means, prominence=peak_prominence, distance=min_distance)
        peak_centers = coordinates[peak_indices]
        adata.uns[f'{feature_layer} peak_centers'] = peak_centers.tolist()

        # Plot the mean signal with the called peak windows overlaid.
        plt.figure(figsize=(6, 3))
        plt.plot(coordinates, means)
        plt.title(f"{feature_layer} with peak calls")
        plt.xlabel("Genomic position")
        plt.ylabel("Mean intensity")
        for i, center in enumerate(peak_centers):
            start, end = center - peak_width // 2, center + peak_width // 2
            plt.axvspan(start, end, color='purple', alpha=0.2)
            plt.axvline(center, color='red', linestyle='--')
            # Alternate label anchor side so adjacent peak labels don't overlap.
            aligned = [end if i % 2 else start, 'left' if i % 2 else 'right']
            plt.text(aligned[0], 0, f"Peak {i}\n{center}", color='red', ha=aligned[1])
        if save_plot and output_dir:
            filename = f"{output_dir}/{date_tag or 'output'}_{feature_layer}_peaks.png"
            plt.savefig(filename, bbox_inches='tight')
            # BUG FIX: report the actual saved path (was a hard-coded placeholder).
            print(f"Saved plot to {filename}")
            # Free the figure when not displayed interactively, so batch runs
            # over many layers don't accumulate open figures.
            plt.close()
        else:
            plt.show()

        feature_peak_columns = []
        for center in peak_centers:
            start, end = center - peak_width // 2, center + peak_width // 2
            colname = f'{feature_layer}_peak_{center}'
            peak_columns.append(colname)
            feature_peak_columns.append(colname)

            peak_mask = (coordinates >= start) & (coordinates <= end)
            adata.var[colname] = peak_mask

            # Per-read summaries of the HMM layer within the peak window.
            region = matrix[:, peak_mask]
            obs_updates[f'mean_{feature_layer}_around_{center}'] = np.mean(region, axis=1)
            obs_updates[f'sum_{feature_layer}_around_{center}'] = np.sum(region, axis=1)
            obs_updates[f'{feature_layer}_present_at_{center}'] = np.mean(region, axis=1) > peak_threshold

            # Initialize per-site-type summary columns before filling them
            # reference-by-reference below.
            for site_type in site_types:
                adata.obs[f'{site_type}_sum_around_{center}'] = 0
                adata.obs[f'{site_type}_mean_around_{center}'] = np.nan

            for ref in adata.obs[obs_column].cat.categories:
                ref_idx = adata.obs[obs_column] == ref
                for site_type in site_types:
                    # BUG FIX: mask_key must be built per site_type. It was
                    # previously computed outside this loop, reusing the stale
                    # `site_type` leaked from the initialization loop above, so
                    # every site type looked up the same (last) mask.
                    mask_key = f"{ref}_{site_type}"
                    if mask_key not in adata.var:
                        continue
                    site_mask = adata.var[mask_key].values
                    site_coords = coordinates[site_mask]
                    region_mask = (site_coords >= start) & (site_coords <= end)
                    if not region_mask.any():
                        continue
                    # Restrict the site mask to positions inside the peak window.
                    full_mask = site_mask.copy()
                    full_mask[site_mask] = region_mask
                    site_region = adata[ref_idx, full_mask].X
                    if hasattr(site_region, "A"):
                        # Densify sparse matrices before nan-aware reductions.
                        site_region = site_region.A
                    if site_region.shape[1] > 0:
                        adata.obs.loc[ref_idx, f'{site_type}_sum_around_{center}'] = np.nansum(site_region, axis=1)
                        adata.obs.loc[ref_idx, f'{site_type}_mean_around_{center}'] = np.nanmean(site_region, axis=1)

        adata.var[f'is_in_any_{feature_layer}_peak'] = adata.var[feature_peak_columns].any(axis=1)
        print(f"Annotated {len(peak_centers)} peaks for {feature_layer}")

    adata.var['is_in_any_peak'] = adata.var[peak_columns].any(axis=1)
    adata.obs = pd.concat([adata.obs, pd.DataFrame(obs_updates, index=adata.obs.index)], axis=1)

    return adata if not inplace else None