PyPI - smftools - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

smftools 0.2.1py3-none-any.whl → 0.2.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

smftools/config/default.yaml CHANGED Viewed

@@ -1,3 +1,13 @@
+# General
+sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
+sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
+sample_name_col_for_plotting: 'Barcode'
+# Compute params
+threads: 4
+device: "auto"
+######## smftools load params #########
 # Generic i/o
 bam_suffix: ".bam"
 recursive_input_search: True
@@ -7,16 +17,12 @@ strands:
   - top
 conversions:
   - unconverted
-sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
-sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
 fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
 fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
 input_already_demuxed: False # If the input files are already demultiplexed.
 delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
-# Compute params
-threads: 4
-device: "auto"
+delete_intermediate_bams: False # Whether to delete intermediate BAM files.
+delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
 # Sequencing modality and general experiment params
 smf_modality: 'conversion' # conversion, deaminase, direct
@@ -34,7 +40,8 @@ model: "hac" # needed for dorado basecaller
 filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
 # Alignment params
-aligner: "minimap2" # Aligner to use: dorado, minimap2
+aligner: "dorado" # Aligner to use: dorado, minimap2
+align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
 aligner_args:
   minimap2:
     ont:
@@ -70,11 +77,11 @@ aligner_args:
   dorado:
     ont:
       - "--mm2-opts"
-      - "-N"
-      - "5"
+      - "-N 5"
 # Sorted BAM and BED specific handling
 make_bigwigs: False # Whether to make coverage bigwigs
+make_beds: False # Whether to make beds from the aligned bams
 # Nanopore specific demultiplexing
 barcode_both_ends: False # dorado demultiplexing
@@ -85,47 +92,58 @@ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall w
 reference_column: 'Reference_strand'
 sample_column: 'Barcode'
-# Preprocessing -  Read length, quality, and mapping filtering params
+######## smftools preprocess params #########
+# Read length, quality, and mapping filtering params
 read_coord_filter:
  - null
  - null
 read_len_filter_thresholds:
-  - 200
+  - 100
   - null
 read_len_to_ref_ratio_filter_thresholds:
-  - 0.8
+  - 0.5
   - null
 read_quality_filter_thresholds:
-  - 20
+  - 15
   - null
 read_mapping_quality_filter_thresholds:
   - null
   - null
-# Preprocessing -  Read modification filtering params
+# Read modification filtering params
 read_mod_filtering_gpc_thresholds:
   - 0.025
   - 0.975
 read_mod_filtering_cpg_thresholds:
   - 0.0
   - 1.0
-read_mod_filtering_any_c_thresholds:
+read_mod_filtering_c_thresholds:
   - 0.025
   - 0.975
 read_mod_filtering_a_thresholds:
   - 0.025
   - 0.975
 read_mod_filtering_use_other_c_as_background: False
-min_valid_fraction_positions_in_read_vs_ref: 0.8
+min_valid_fraction_positions_in_read_vs_ref: 0.5
-# Preprocessing - Duplicate detection params
+# Plotting params for read length histograms
+obs_to_plot_pp_qc:
+  - read_length
+  - mapped_length
+  - read_quality
+  - mapping_quality
+  - mapped_length_to_reference_length_ratio
+  - mapped_length_to_read_length_ratio
+  - Raw_modification_signal
+# Duplicate detection params
 duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
   - "GpC"
   - "CpG"
   - "ambiguous_GpC_CpG"
 duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
 hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
-  - Fraction_any_C_site_modified
+  - Fraction_C_site_modified
 duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
 duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
 duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
@@ -133,33 +151,43 @@ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicog
 duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
 duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
-# Preprocessing - Complexity analysis params
+# Position QC params
+position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
-# General Plotting params
-sample_name_col_for_plotting: 'Barcode'
+######## smftools spatial params #########
+invert_adata: False # Whether to invert the AnnData along the positions axis.
+# Reindexing params
+reindexing_offsets:
+  null : null
+reindexed_var_suffix: "reindexed"
-# Basic Analysis - QC Plotting params
+# Spatial Analysis - QC Plotting params
 rows_per_qc_histogram_grid: 12
-# Basic Analysis - Clustermap params
+# Spatial Analysis - Clustermap params
 layer_for_clustermap_plotting: 'nan0_0minus1'
+clustermap_cmap_c: "coolwarm"
+clustermap_cmap_gpc: "coolwarm"
+clustermap_cmap_cpg: "coolwarm"
+clustermap_cmap_a: "coolwarm"
+spatial_clustermap_sortby: "gpc"
-# Basic Analysis - UMAP/Leiden params
+# Spatial Analysis - UMAP/Leiden params
 layer_for_umap_plotting: 'nan_half'
 umap_layers_to_plot:
   - "mapped_length"
   - "Raw_modification_signal"
-# Basic Analysis - Spatial Autocorrelation params
+# Spatial Analysis - Spatial Autocorrelation params
 rows_per_qc_autocorr_grid: 6
 autocorr_rolling_window_size: 25
 autocorr_max_lag: 800
 autocorr_site_types:
   - "GpC"
   - "CpG"
-  - "any_C"
+  - "C"
-# Basic Analysis - Correlation Matrix params
+# Spatial Analysis - Correlation Matrix params
 correlation_matrix_types:
   - "pearson"
   - "binary_covariance"
@@ -169,6 +197,7 @@ correlation_matrix_cmaps:
 correlation_matrix_site_types:
   - "GpC_site"
+######## smftools hmm params #########
 # HMM params
 hmm_n_states: 2 # Number of HMM states
 hmm_init_emission_probs:
@@ -197,18 +226,105 @@ hmm_feature_sets:
   footprint:
     state: "Non-Modified"
     features:
-      small_bound_stretch: [0, 25]
-      medium_bound_stretch: [25, 80]
-      putative_nucleosome: [80, 200]
+      small_bound_stretch: [6, 40]
+      medium_bound_stretch: [40, 100]
+      putative_nucleosome: [100, 200]
       large_bound_stretch: [200, inf]
   accessible:
     state: "Modified"
     features:
-      small_accessible_patch: [0, 20]
-      mid_accessible_patch: [20, 100]
-      large_accessible_patch: [100, inf]
+      small_accessible_patch: [3, 20]
+      mid_accessible_patch: [20, 40]
+      large_accessible_patch: [40, 110]
+      nucleosome_depleted_region: [110, inf]
 hmm_merge_layer_features:
   - [null, 80]
+clustermap_cmap_hmm: "coolwarm"
+hmm_clustermap_feature_layers:
+  - all_accessible_features
+  - all_accessible_features_merged
+  - small_accessible_patch
+  - mid_accessible_patch
+  - large_accessible_patch
+  - nucleosome_depleted_region
+  - small_bound_stretch
+  - medium_bound_stretch
+  - putative_nucleosome
+  - large_bound_stretch
+hmm_clustermap_sortby: "hmm"
+hmm_peak_feature_configs:
+  all_accessible_features:
+    min_distance: 200 # The minimum distance in between called peaks
+    peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
+    peak_prominence: 0.1 # The minimum prominence to call a peak
+    peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
+    rolling_window: 50 # Window size for the rolling average smoothing before peak calling
+  all_accessible_features_merged:
+    min_distance: 250
+    peak_width: 250
+    peak_prominence: 0.05
+    peak_threshold: 0.80
+    rolling_window: 50
+  small_accessible_patch:
+    min_distance: 40
+    peak_width: 30
+    peak_prominence: 0.1
+    peak_threshold: 0.8
+    rolling_window: 40
+  mid_accessible_patch:
+    min_distance: 100
+    peak_width: 60
+    peak_prominence: 0.025
+    peak_threshold: 0.80
+    rolling_window: 50
+  large_accessible_patch:
+    min_distance: 100
+    peak_width: 100
+    peak_prominence: 0.025
+    peak_threshold: 0.80
+    rolling_window: 50
+  nucleosome_depleted_region:
+    min_distance: 200
+    peak_width: 200
+    peak_prominence: 0.025
+    peak_threshold: 0.80
+    rolling_window: 50
+  small_bound_stretch:
+    min_distance: 20
+    peak_width: 20
+    peak_prominence: 0.01
+    peak_threshold: 0.50
+    rolling_window: 10
+  medium_bound_stretch:
+    min_distance: 40
+    peak_width: 40
+    peak_prominence: 0.01
+    peak_threshold: 0.50
+    rolling_window: 20
+  putative_nucleosome:
+    min_distance: 160
+    peak_width: 147     # canonical nucleosome footprint
+    peak_prominence: 0.025
+    peak_threshold: 0.60
+    rolling_window: 20
+  large_bound_stretch:
+    min_distance: 250
+    peak_width: 300
+    peak_prominence: 0.20
+    peak_threshold: 0.80
+    rolling_window: 50
+# Pipeline control flow - load adata
+force_redo_load_adata: False # Whether to perform load adata command from start
 # Pipeline control flow - Preprocessing and QC
 force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
@@ -219,7 +335,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
 force_redo_clean_nan: False # Whether to redo NaN cleaning
 bypass_append_base_context: False # Whether to skip adding per reference base context additions.
 force_redo_append_base_context: False # Whether to redo per reference base context additions.
-invert_adata: False # Whether to invert the AnnData along the positions axis.
 bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
 force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
 bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
@@ -231,8 +346,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
 bypass_complexity_analysis: False # Whether to skip complexity analysis
 force_redo_complexity_analysis: False # Whether to redo complexity analysis
-# Pipeline control flow - Basic Analyses
-force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
+# Pipeline control flow - Spatial Analyses
+force_redo_spatial_analyses: False # Whether to force redo the entire spatial analysis pipeline from the AnnData
 bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
 force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
 bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting

smftools/config/direct.yaml CHANGED Viewed

@@ -1,5 +1,7 @@
 # Direct (Nanopore modified base calling) footprinting defaults
 extends: default
+######## smftools load params #########
 filter_threshold: 0.8 # min threshold to call a canonical base
 m6A_threshold: 0.7 # min threshold to call a modified m6a base
 m5C_threshold: 0.7 # min threshold to call a modified 5mC base
@@ -12,6 +14,31 @@ thresholds:
 mod_list:
   - '5mC_5hmC'
   - '6mA' # mods to detect
+mod_map:
+  5mC_5hmC: 5mC
+  6mA: 6mA
+mod_target_bases:
+  - "A"
+enzyme_target_bases:
+  - "A"
 batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
 skip_unclassified: True # Whether to skip unclassified barcodes
-delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
+delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
+######## smftools preprocess params ########
+fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position-by-position thresholds for modification binarization.
+binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
+positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
+negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
+infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
+inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
+fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
+output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
+######## smftools spatial params #########
+autocorr_site_types:
+  - "A"
+######## smftools hmm params #########
+hmm_methbases:
+  - "A"

smftools/config/discover_input_files.py ADDED Viewed

@@ -0,0 +1,115 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Dict, List, Any, Iterable, Union
def discover_input_files(
    input_data_path: Union[str, Path],
    bam_suffix: str = ".bam",
    recursive: bool = False,
    follow_symlinks: bool = False,
) -> Dict[str, Any]:
    """
    Discover and categorize input files under `input_data_path`.

    Args:
        input_data_path: A single file, or a directory to scan.
        bam_suffix: Extension treated as BAM. Compared case-insensitively; a
            leading dot is added when missing.
        recursive: If False, only the immediate children of a directory are
            scanned. If True, the entire tree is scanned.
        follow_symlinks: Only used when `recursive` is True. If True, the scan
            descends into symlinked directories. Each real directory is
            visited at most once, so symlink cycles terminate.

    Returns:
        A dict with:
          - pod5_paths, fast5_paths, fastq_paths, bam_paths, h5ad_paths,
            other_paths (sorted lists of Path)
          - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam,
            input_is_h5ad (bools, True when the matching list is non-empty)
          - all_files_searched (int, number of regular files categorized)

    Raises:
        FileNotFoundError: If `input_data_path` does not exist.

    Behavior:
      - If `input_data_path` is a file, returns that single file categorized.
      - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
    """
    # Local import keeps this block self-contained; `os.walk` is used because
    # `Path.rglob` has no `follow_symlinks` keyword on any Python version
    # (3.13 added `recurse_symlinks`), so the option could never take effect.
    import os

    p = Path(input_data_path)

    # Normalize the BAM suffix: leading dot, lower case.
    if not bam_suffix.startswith("."):
        bam_suffix = "." + bam_suffix
    bam_suffix = bam_suffix.lower()

    # Canonical extension keys to compare against.
    pod5_exts = {".pod5", ".p5"}
    fast5_exts = {".fast5", ".f5"}
    fastq_exts = {
        ".fastq", ".fq",
        ".fastq.gz", ".fq.gz",
        ".fastq.bz2", ".fq.bz2",
        ".fastq.xz", ".fq.xz",
        ".fastq.zst", ".fq.zst",
    }
    h5ad_exts = {".h5ad", ".h5"}
    compressed_exts = {".gz", ".bz2", ".xz", ".zst"}

    def ext_key(pp: Path) -> str:
        """
        Return the lower-cased last suffix, or the last two suffixes joined
        when the final one is a compressor (.gz/.bz2/.xz/.zst).

        Examples:
          a.fastq.gz -> ".fastq.gz"
          a.fq.xz    -> ".fq.xz"
          a.bam      -> ".bam"
          a          -> ""
        """
        suff = [s.lower() for s in pp.suffixes]
        if not suff:
            return ""
        if suff[-1] in compressed_exts and len(suff) >= 2:
            return suff[-2] + suff[-1]
        return suff[-1]

    pod5_paths: List[Path] = []
    fast5_paths: List[Path] = []
    fastq_paths: List[Path] = []
    bam_paths: List[Path] = []
    h5ad_paths: List[Path] = []
    other_paths: List[Path] = []

    def categorize_file(fp: Path) -> None:
        """Append `fp` to the list matching its extension key."""
        key = ext_key(fp)
        if key in pod5_exts:
            pod5_paths.append(fp)
        elif key in fast5_exts:
            fast5_paths.append(fp)
        elif key in fastq_exts:
            fastq_paths.append(fp)
        elif key in h5ad_exts:
            h5ad_paths.append(fp)
        elif key == bam_suffix:
            bam_paths.append(fp)
        else:
            other_paths.append(fp)

    def walk_tree(root: Path):
        """Yield every non-directory entry below `root`, honoring `follow_symlinks`."""
        visited_real_dirs = set()
        for dirpath, dirnames, filenames in os.walk(root, followlinks=follow_symlinks):
            if follow_symlinks:
                # Guard against symlink cycles: never scan the same real
                # directory twice, and do not descend below a repeat.
                real_dir = os.path.realpath(dirpath)
                if real_dir in visited_real_dirs:
                    dirnames[:] = []
                    continue
                visited_real_dirs.add(real_dir)
            for name in filenames:
                yield Path(dirpath) / name

    if not p.exists():
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    total_searched = 0
    if p.is_file():
        total_searched = 1
        categorize_file(p)
    else:
        iterator = walk_tree(p) if recursive else p.iterdir()
        for fp in iterator:
            # Skips directories, broken symlinks and other non-regular entries.
            if not fp.is_file():
                continue
            total_searched += 1
            categorize_file(fp)

    return {
        "pod5_paths": sorted(pod5_paths),
        "fast5_paths": sorted(fast5_paths),
        "fastq_paths": sorted(fastq_paths),
        "bam_paths": sorted(bam_paths),
        "h5ad_paths": sorted(h5ad_paths),
        "other_paths": sorted(other_paths),
        "input_is_pod5": len(pod5_paths) > 0,
        "input_is_fast5": len(fast5_paths) > 0,
        "input_is_fastq": len(fastq_paths) > 0,
        "input_is_bam": len(bam_paths) > 0,
        "input_is_h5ad": len(h5ad_paths) > 0,
        "all_files_searched": total_searched,
    }

smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

smftools 0.2.1py3-none-any.whl → 0.2.4py3-none-any.whl