PyPI - smftools - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

smftools-0.2.3-py3-none-any.whl → smftools-0.2.4-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (46)

smftools/_version.py +1 -1
smftools/cli/helpers.py +48 -0
smftools/cli/hmm_adata.py +168 -145
smftools/cli/load_adata.py +155 -95
smftools/cli/preprocess_adata.py +222 -130
smftools/cli/spatial_adata.py +441 -308
smftools/cli_entry.py +4 -5
smftools/config/conversion.yaml +12 -5
smftools/config/deaminase.yaml +11 -9
smftools/config/default.yaml +123 -19
smftools/config/direct.yaml +3 -0
smftools/config/experiment_config.py +120 -19
smftools/hmm/HMM.py +12 -1
smftools/hmm/__init__.py +0 -6
smftools/hmm/archived/call_hmm_peaks.py +106 -0
smftools/hmm/call_hmm_peaks.py +318 -90
smftools/informatics/bam_functions.py +28 -29
smftools/informatics/h5ad_functions.py +1 -1
smftools/plotting/general_plotting.py +97 -51
smftools/plotting/position_stats.py +3 -3
smftools/preprocessing/__init__.py +2 -4
smftools/preprocessing/append_base_context.py +34 -25
smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
smftools/preprocessing/binarize_on_Youden.py +10 -8
smftools/preprocessing/calculate_complexity_II.py +1 -1
smftools/preprocessing/calculate_coverage.py +16 -13
smftools/preprocessing/calculate_position_Youden.py +41 -25
smftools/preprocessing/calculate_read_modification_stats.py +1 -1
smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
smftools/preprocessing/flag_duplicate_reads.py +1 -1
smftools/preprocessing/invert_adata.py +1 -1
smftools/preprocessing/load_sample_sheet.py +1 -1
smftools/preprocessing/reindex_references_adata.py +37 -0
smftools/readwrite.py +94 -0
{smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
{smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
/smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
/smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
/smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
/smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
/smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
/smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
{smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
{smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
{smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0

smftools/cli_entry.py CHANGED Viewed

@@ -4,12 +4,11 @@ from pathlib import Path
 from typing import Dict, Optional, Sequence
 from .cli.load_adata import load_adata
-from .cli.cli_flows import flow_I
 from .cli.preprocess_adata import preprocess_adata
 from .cli.spatial_adata import spatial_adata
 from .cli.hmm_adata import hmm_adata
-from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
+from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
 @click.group()
 def cli():
@@ -244,9 +243,9 @@ def concatenate_cmd(
     Two modes:
-        smftools concatenate out.h5ad --input-dir ./dir
+        smftools concatenate out.h5ad.gz --input-dir ./dir
-        smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path
+        smftools concatenate out.h5ad.gz --csv-path paths.csv --csv-column h5ad_path
     TXT input also works (one file path per line).
@@ -266,7 +265,7 @@ def concatenate_cmd(
             delete_inputs=delete,
             restore_backups=restore,
         )
-        click.echo(f"✓ Concatenated file written to: {out}")
+        click.echo(f"Concatenated file written to: {out}")
     except Exception as e:
         raise click.ClickException(str(e)) from e

smftools/config/conversion.yaml CHANGED Viewed

@@ -9,6 +9,13 @@ conversion_types:
 # Read QC Params
 read_mod_filtering_use_other_c_as_background: True
+# Spatial Analysis - Clustermap params
+layer_for_clustermap_plotting: 'nan0_0minus1'
+clustermap_cmap_c: "coolwarm"
+clustermap_cmap_gpc: "coolwarm"
+clustermap_cmap_cpg: "viridis"
+clustermap_cmap_a: "coolwarm"
 ######## smftools hmm params #########
 # HMM
 cpg: True # whether to use the default HMM endogenous CpG patch params
@@ -18,17 +25,17 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [10, 30]
-      medium_bound_stretch: [30, 110]
-      putative_nucleosome: [110, 200]
+      small_bound_stretch: [6, 40]
+      medium_bound_stretch: [40, 100]
+      putative_nucleosome: [100, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
       small_accessible_patch: [3, 20]
       mid_accessible_patch: [20, 40]
-      mid_large_accessible_patch: [40, 130]
-      large_accessible_patch: [130, inf]
+      large_accessible_patch: [40, 110]
+      nucleosome_depleted_region: [110, inf]
   cpg:
     state: "Modified"
     features:

smftools/config/deaminase.yaml CHANGED Viewed

@@ -7,6 +7,8 @@ conversion_types:
 mod_target_bases:
 - "C"
+enzyme_target_bases:
+- "C"
 ######## smftools preprocess params #########
 read_mod_filtering_gpc_thresholds:
@@ -15,7 +17,7 @@ read_mod_filtering_gpc_thresholds:
 read_mod_filtering_cpg_thresholds:
   - null
   - null
-read_mod_filtering_any_c_thresholds:
+read_mod_filtering_c_thresholds:
   - 0.01
   - 0.99
 read_mod_filtering_a_thresholds:
@@ -26,16 +28,16 @@ read_mod_filtering_use_other_c_as_background: False
 # Duplicate Detection Params
 duplicate_detection_site_types:
-  - "any_C"
+  - "C"
 ######## smftools analyze params #########
 # Autocorrelation params
 autocorr_site_types:
-  - "any_C"
+  - "C"
 # Correlation matrix params
 correlation_matrix_site_types:
-  - "any_C_site"
+  - "C_site"
 # ######## smftools hmm params #########
 cpg: False # whether to use the default HMM endogenous CpG patch params
@@ -45,17 +47,17 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [10, 30]
-      medium_bound_stretch: [30, 110]
-      putative_nucleosome: [110, 200]
+      small_bound_stretch: [6, 40]
+      medium_bound_stretch: [40, 100]
+      putative_nucleosome: [100, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
       small_accessible_patch: [3, 20]
       mid_accessible_patch: [20, 40]
-      mid_large_accessible_patch: [40, 130]
-      large_accessible_patch: [130, inf]
+      large_accessible_patch: [40, 110]
+      nucleosome_depleted_region: [110, inf]
 hmm_merge_layer_features:
   - ["C_all_accessible_features", 80]

smftools/config/default.yaml CHANGED Viewed

@@ -21,7 +21,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
 fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
 input_already_demuxed: False # If the input files are already demultiplexed.
 delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
-delete_intermediate_bams: True # Whether to delete intermediate BAM files.
+delete_intermediate_bams: False # Whether to delete intermediate BAM files.
 delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
 # Sequencing modality and general experiment params
@@ -40,7 +40,8 @@ model: "hac" # needed for dorado basecaller
 filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
 # Alignment params
-aligner: "minimap2" # Aligner to use: dorado, minimap2
+aligner: "dorado" # Aligner to use: dorado, minimap2
+align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
 aligner_args:
   minimap2:
     ont:
@@ -116,7 +117,7 @@ read_mod_filtering_gpc_thresholds:
 read_mod_filtering_cpg_thresholds:
   - 0.0
   - 1.0
-read_mod_filtering_any_c_thresholds:
+read_mod_filtering_c_thresholds:
   - 0.025
   - 0.975
 read_mod_filtering_a_thresholds:
@@ -125,6 +126,16 @@ read_mod_filtering_a_thresholds:
 read_mod_filtering_use_other_c_as_background: False
 min_valid_fraction_positions_in_read_vs_ref: 0.5
+# Plotting params for read length histograms
+obs_to_plot_pp_qc:
+  - read_length
+  - mapped_length
+  - read_quality
+  - mapping_quality
+  - mapped_length_to_reference_length_ratio
+  - mapped_length_to_read_length_ratio
+  - Raw_modification_signal
 # Duplicate detection params
 duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
   - "GpC"
@@ -132,7 +143,7 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
   - "ambiguous_GpC_CpG"
 duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
 hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
-  - Fraction_any_C_site_modified
+  - Fraction_C_site_modified
 duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
 duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
 duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
@@ -143,29 +154,40 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
 # Position QC params
 position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
-######## smftools analyze params #########
-# Basic Analysis - QC Plotting params
+######## smftools spatial params #########
+invert_adata: False # Whether to invert the AnnData along the positions axis.
+# Reindexing params
+reindexing_offsets:
+  null : null
+reindexed_var_suffix: "reindexed"
+# Spatial Analysis - QC Plotting params
 rows_per_qc_histogram_grid: 12
-# Basic Analysis - Clustermap params
+# Spatial Analysis - Clustermap params
 layer_for_clustermap_plotting: 'nan0_0minus1'
+clustermap_cmap_c: "coolwarm"
+clustermap_cmap_gpc: "coolwarm"
+clustermap_cmap_cpg: "coolwarm"
+clustermap_cmap_a: "coolwarm"
+spatial_clustermap_sortby: "gpc"
-# Basic Analysis - UMAP/Leiden params
+# Spatial Analysis - UMAP/Leiden params
 layer_for_umap_plotting: 'nan_half'
 umap_layers_to_plot:
   - "mapped_length"
   - "Raw_modification_signal"
-# Basic Analysis - Spatial Autocorrelation params
+# Spatial Analysis - Spatial Autocorrelation params
 rows_per_qc_autocorr_grid: 6
 autocorr_rolling_window_size: 25
 autocorr_max_lag: 800
 autocorr_site_types:
   - "GpC"
   - "CpG"
-  - "any_C"
+  - "C"
-# Basic Analysis - Correlation Matrix params
+# Spatial Analysis - Correlation Matrix params
 correlation_matrix_types:
   - "pearson"
   - "binary_covariance"
@@ -204,19 +226,102 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [10, 40]
-      medium_bound_stretch: [40, 110]
-      putative_nucleosome: [110, 200]
+      small_bound_stretch: [6, 40]
+      medium_bound_stretch: [40, 100]
+      putative_nucleosome: [100, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
       small_accessible_patch: [3, 20]
       mid_accessible_patch: [20, 40]
-      mid_large_accessible_patch: [40, 110]
-      large_accessible_patch: [110, inf]
+      large_accessible_patch: [40, 110]
+      nucleosome_depleted_region: [110, inf]
 hmm_merge_layer_features:
   - [null, 80]
+clustermap_cmap_hmm: "coolwarm"
+hmm_clustermap_feature_layers:
+  - all_accessible_features
+  - all_accessible_features_merged
+  - small_accessible_patch
+  - mid_accessible_patch
+  - large_accessible_patch
+  - nucleosome_depleted_region
+  - small_bound_stretch
+  - medium_bound_stretch
+  - putative_nucleosome
+  - large_bound_stretch
+hmm_clustermap_sortby: "hmm"
+hmm_peak_feature_configs:
+  all_accessible_features:
+    min_distance: 200 # The minimum distance in between called peaks
+    peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
+    peak_prominence: 0.1 # The minimum prominence to call a peak
+    peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
+    rolling_window: 50 # Window size for the rolling average smoothing before peak calling
+  all_accessible_features_merged:
+    min_distance: 250
+    peak_width: 250
+    peak_prominence: 0.05
+    peak_threshold: 0.80
+    rolling_window: 50
+  small_accessible_patch:
+    min_distance: 40
+    peak_width: 30
+    peak_prominence: 0.1
+    peak_threshold: 0.8
+    rolling_window: 40
+  mid_accessible_patch:
+    min_distance: 100
+    peak_width: 60
+    peak_prominence: 0.025
+    peak_threshold: 0.80
+    rolling_window: 50
+  large_accessible_patch:
+    min_distance: 100
+    peak_width: 100
+    peak_prominence: 0.025
+    peak_threshold: 0.80
+    rolling_window: 50
+  nucleosome_depleted_region:
+    min_distance: 200
+    peak_width: 200
+    peak_prominence: 0.025
+    peak_threshold: 0.80
+    rolling_window: 50
+  small_bound_stretch:
+    min_distance: 20
+    peak_width: 20
+    peak_prominence: 0.01
+    peak_threshold: 0.50
+    rolling_window: 10
+  medium_bound_stretch:
+    min_distance: 40
+    peak_width: 40
+    peak_prominence: 0.01
+    peak_threshold: 0.50
+    rolling_window: 20
+  putative_nucleosome:
+    min_distance: 160
+    peak_width: 147     # canonical nucleosome footprint
+    peak_prominence: 0.025
+    peak_threshold: 0.60
+    rolling_window: 20
+  large_bound_stretch:
+    min_distance: 250
+    peak_width: 300
+    peak_prominence: 0.20
+    peak_threshold: 0.80
+    rolling_window: 50
 # Pipeline control flow - load adata
 force_redo_load_adata: False # Whether to perform load adata command from start
@@ -230,7 +335,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
 force_redo_clean_nan: False # Whether to redo NaN cleaning
 bypass_append_base_context: False # Whether to skip adding per reference base context additions.
 force_redo_append_base_context: False # Whether to redo per reference base context additions.
-invert_adata: False # Whether to invert the AnnData along the positions axis.
 bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
 force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
 bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
@@ -242,8 +346,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
 bypass_complexity_analysis: False # Whether to skip complexity analysis
 force_redo_complexity_analysis: False # Whether to redo complexity analysis
-# Pipeline control flow - Basic Analyses
-force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
+# Pipeline control flow - Spatial Analyses
+force_redo_spatial_analyses: False # Whether to force redo the entire spatial analysis pipeline from the AnnData
 bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
 force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
 bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting

smftools/config/direct.yaml CHANGED Viewed

@@ -14,6 +14,9 @@ thresholds:
 mod_list:
   - '5mC_5hmC'
   - '6mA' # mods to detect
+mod_map:
+  5mC_5hmC: 5mC
+  6mA: 6mA
 mod_target_bases:
   - "A"
 enzyme_target_bases:

smftools/config/experiment_config.py CHANGED Viewed

@@ -214,7 +214,7 @@ def resolve_aligner_args(
     return list(default_by_aligner.get(key_align, []))
-# HMM default params and hepler functions
+# HMM default params and helper functions
 def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
     """
     Normalize user-provided `hmm_feature_sets` into canonical structure:
@@ -275,6 +275,58 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
         canonical[grp] = {"features": feats, "state": state}
     return canonical
+def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
+    """
+    Normalize user-provided `hmm_peak_feature_configs` into:
+      {
+        layer_name: {
+          "min_distance": int,
+          "peak_width": int,
+          "peak_prominence": float,
+          "peak_threshold": float,
+          "rolling_window": int,
+        },
+        ...
+      }
+    Accepts dict, JSON/string, None. Returns {} for empty input.
+    """
+    if raw is None:
+        return {}
+    parsed = raw
+    if isinstance(raw, str):
+        parsed = _try_json_or_literal(raw)
+    if not isinstance(parsed, dict):
+        return {}
+    defaults = {
+        "min_distance": 200,
+        "peak_width": 200,
+        "peak_prominence": 0.2,
+        "peak_threshold": 0.8,
+        "rolling_window": 1,
+    }
+    out: Dict[str, dict] = {}
+    for layer, conf in parsed.items():
+        if conf is None:
+            conf = {}
+        if not isinstance(conf, dict):
+            # allow shorthand like 300 -> interpreted as peak_width
+            conf = {"peak_width": conf}
+        full = defaults.copy()
+        full.update(conf)
+        out[str(layer)] = {
+            "min_distance": int(full["min_distance"]),
+            "peak_width": int(full["peak_width"]),
+            "peak_prominence": float(full["peak_prominence"]),
+            "peak_threshold": float(full["peak_threshold"]),
+            "rolling_window": int(full["rolling_window"]),
+        }
+    return out
 # -------------------------
 # LoadExperimentConfig
@@ -612,7 +664,7 @@ class ExperimentConfig:
     fastq_auto_pairing: bool = True
     # Remove intermediate file options
-    delete_intermediate_bams: bool = True
+    delete_intermediate_bams: bool = False
     delete_intermediate_tsvs: bool = True
     # Conversion/Deamination file handling
@@ -647,11 +699,13 @@ class ExperimentConfig:
     m5C_threshold: float = 0.7
     hm5C_threshold: float = 0.7
     thresholds: List[float] = field(default_factory=list)
-    mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
+    mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"]) # Dorado modified basecalling codes
+    mod_map: Dict[str, str] = field(default_factory=lambda: {'6mA': '6mA', '5mC_5hmC': '5mC'}) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
     # Alignment params
     mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
-    aligner: str = "minimap2"
+    align_from_bam: bool = False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
+    aligner: str = "dorado"
     aligner_args: Optional[List[str]] = None
     make_bigwigs: bool = False
     make_beds: bool = False
@@ -671,6 +725,10 @@ class ExperimentConfig:
     read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
     read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
+    # Preprocessing - Optional reindexing params
+    reindexing_offsets: Dict[str, int] = field(default_factory=dict)
+    reindexed_var_suffix: Optional[str] = "reindexed"
     # Preprocessing - Direct mod detection binarization params
     fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
     binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
@@ -684,15 +742,18 @@ class ExperimentConfig:
     # Preprocessing - Read modification filter params
     read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
     read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
-    read_mod_filtering_any_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
+    read_mod_filtering_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
     read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
     read_mod_filtering_use_other_c_as_background: bool = True
     min_valid_fraction_positions_in_read_vs_ref: float = 0.2
+    # Preprocessing - plotting params
+    obs_to_plot_pp_qc: List[str] = field(default_factory=lambda: ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal'])
     # Preprocessing - Duplicate detection params
     duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
     duplicate_detection_distance_threshold: float = 0.07
-    hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_any_C_site_modified'])
+    hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_C_site_modified'])
     duplicate_detection_keep_best_metric: str ='read_quality'
     duplicate_detection_window_size_for_hamming_neighbors: int = 50
     duplicate_detection_min_overlapping_positions: int = 20
@@ -703,20 +764,25 @@ class ExperimentConfig:
     # Preprocessing - Position QC
     position_max_nan_threshold: float = 0.1
-    # Basic Analysis - Clustermap params
+    # Spatial Analysis - Clustermap params
     layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
+    clustermap_cmap_c: Optional[str] = 'coolwarm'
+    clustermap_cmap_gpc: Optional[str] = 'coolwarm'
+    clustermap_cmap_cpg: Optional[str] = 'coolwarm'
+    clustermap_cmap_a: Optional[str] = 'coolwarm'
+    spatial_clustermap_sortby: Optional[str] = 'gpc'
-    # Basic Analysis - UMAP/Leiden params
+    # Spatial Analysis - UMAP/Leiden params
     layer_for_umap_plotting: Optional[str] = 'nan_half'
     umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
-    # Basic Analysis - Spatial Autocorrelation params
+    # Spatial Analysis - Spatial Autocorrelation params
     rows_per_qc_autocorr_grid: int = 12
     autocorr_rolling_window_size: int = 25
     autocorr_max_lag: int = 800
-    autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'any_C'])
+    autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'C'])
-    # Basic Analysis - Correlation Matrix params
+    # Spatial Analysis - Correlation Matrix params
     correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
     correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
     correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
@@ -738,6 +804,10 @@ class ExperimentConfig:
     cpg: Optional[bool] = False
     hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
     hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
+    clustermap_cmap_hmm: Optional[str] = 'coolwarm'
+    hmm_clustermap_feature_layers: List[str] = field(default_factory=lambda: ["all_accessible_features"])
+    hmm_clustermap_sortby: Optional[str] = 'hmm'
+    hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
     # Pipeline control flow - load adata
     force_redo_load_adata: bool = False
@@ -763,8 +833,8 @@ class ExperimentConfig:
     bypass_complexity_analysis: bool = False
     force_redo_complexity_analysis: bool = False
-    # Pipeline control flow - Basic Analyses
-    force_redo_basic_analyses: bool = False
+    # Pipeline control flow - Spatial Analyses
+    force_redo_spatial_analyses: bool = False
     bypass_basic_clustermaps: bool = False
     force_redo_basic_clustermaps: bool = False
     bypass_basic_umap: bool = False
@@ -930,7 +1000,14 @@ class ExperimentConfig:
                 input_type = "h5ad"
                 input_files = found["h5ad_paths"]
-            print(f"Found {found['all_files_searched']} files; fastq={len(found["fastq_paths"])}, bam={len(found["bam_paths"])}, pod5={len(found["pod5_paths"])}, fast5={len(found["fast5_paths"])}, , h5ad={len(found["h5ad_paths"])}")
+            print(
+                f"Found {found['all_files_searched']} files; "
+                f"fastq={len(found['fastq_paths'])}, "
+                f"bam={len(found['bam_paths'])}, "
+                f"pod5={len(found['pod5_paths'])}, "
+                f"fast5={len(found['fast5_paths'])}, "
+                f"h5ad={len(found['h5ad_paths'])}"
+            )
         # summary file output path
         output_dir = Path(merged['output_directory'])
@@ -981,6 +1058,9 @@ class ExperimentConfig:
         if "mod_list" in merged:
             merged["mod_list"] = _parse_list(merged.get("mod_list"))
+        # Preprocessing args
+        obs_to_plot_pp_qc = _parse_list(merged.get("obs_to_plot_pp_qc", None))
         # HMM feature set handling
         if "hmm_feature_sets" in merged:
             merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
@@ -1016,6 +1096,13 @@ class ExperimentConfig:
             hmm_methbases = ['C']
         hmm_methbases = list(hmm_methbases)
         hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
+        hmm_clustermap_feature_layers = _parse_list(merged.get("hmm_clustermap_feature_layers", "all_accessible_features"))
+        # HMM peak feature configs (for call_hmm_peaks)
+        merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
+            merged.get("hmm_peak_feature_configs", {})
+        )
+        hmm_peak_feature_configs = merged.get("hmm_peak_feature_configs", {})
         # instantiate dataclass
         instance = cls(
@@ -1047,8 +1134,9 @@ class ExperimentConfig:
             threads = merged.get("threads"),
             sample_sheet_path = merged.get("sample_sheet_path"),
             sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
-            delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
+            delete_intermediate_bams = merged.get("delete_intermediate_bams", False),
             delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
+            align_from_bam = merged.get("align_from_bam", False),
             aligner = merged.get("aligner", "minimap2"),
             aligner_args = merged.get("aligner_args", None),
             device = merged.get("device", "auto"),
@@ -1070,6 +1158,7 @@ class ExperimentConfig:
             reference_column = merged.get("reference_column", 'Reference_strand'),
             sample_column = merged.get("sample_column", 'Barcode'),
             sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
+            obs_to_plot_pp_qc = obs_to_plot_pp_qc,
             fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
             binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
             positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
@@ -1078,14 +1167,21 @@ class ExperimentConfig:
             inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
             fit_j_threshold = merged.get("fit_j_threshold", 0.5),
             output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
+            reindexing_offsets = merged.get("reindexing_offsets", {None: None}),
+            reindexed_var_suffix = merged.get("reindexed_var_suffix", "reindexed"),
             layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
+            clustermap_cmap_c = merged.get("clustermap_cmap_c", 'coolwarm'),
+            clustermap_cmap_gpc = merged.get("clustermap_cmap_gpc", 'coolwarm'),
+            clustermap_cmap_cpg = merged.get("clustermap_cmap_cpg", 'coolwarm'),
+            clustermap_cmap_a = merged.get("clustermap_cmap_a", 'coolwarm'),
+            spatial_clustermap_sortby = merged.get("spatial_clustermap_sortby", 'gpc'),
             layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
             umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
             rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
             rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
             autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
             autocorr_max_lag = merged.get("autocorr_max_lag", 800),
-            autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'any_C']),
+            autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'C']),
             hmm_n_states = merged.get("hmm_n_states", 2),
             hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
             hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
@@ -1099,6 +1195,10 @@ class ExperimentConfig:
             hmm_methbases = hmm_methbases,
             hmm_device = hmm_device,
             hmm_merge_layer_features = hmm_merge_layer_features,
+            clustermap_cmap_hmm = merged.get("clustermap_cmap_hmm", 'coolwarm'),
+            hmm_clustermap_feature_layers = hmm_clustermap_feature_layers,
+            hmm_clustermap_sortby = merged.get("hmm_clustermap_sortby", 'hmm'),
+            hmm_peak_feature_configs = hmm_peak_feature_configs,
             footprints = merged.get("footprints", None),
             accessible_patches = merged.get("accessible_patches", None),
             cpg = merged.get("cpg", None),
@@ -1109,7 +1209,7 @@ class ExperimentConfig:
             read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
             read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
             read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
-            read_mod_filtering_any_c_thresholds = merged.get("read_mod_filtering_any_c_thresholds", [0.025, 0.975]),
+            read_mod_filtering_c_thresholds = merged.get("read_mod_filtering_c_thresholds", [0.025, 0.975]),
             read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
             read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
             min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
@@ -1125,7 +1225,7 @@ class ExperimentConfig:
             correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
             correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
             correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
-            hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
+            hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_C_site_modified']),
             force_redo_load_adata = merged.get("force_redo_load_adata", False),
             force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
             force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
@@ -1146,7 +1246,7 @@ class ExperimentConfig:
             force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
             bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
             force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
-            force_redo_basic_analyses = merged.get("force_redo_basic_analyses", False),
+            force_redo_spatial_analyses = merged.get("force_redo_spatial_analyses", False),
             bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
             force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
             bypass_basic_umap = merged.get("bypass_basic_umap", False),
@@ -1198,6 +1298,7 @@ class ExperimentConfig:
     # -------------------------
     # validation & serialization
     # -------------------------
+    @staticmethod
     def _validate_hmm_features_structure(hfs: dict) -> List[str]:
         errs = []
         if not isinstance(hfs, dict):

smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

smftools-0.2.3-py3-none-any.whl → smftools-0.2.4-py3-none-any.whl