smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py CHANGED
@@ -1,19 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Sequence
6
+
1
7
  import click
2
8
  import pandas as pd
3
- from pathlib import Path
4
- from typing import Dict, Optional, Sequence
5
9
 
10
+ from .cli.hmm_adata import hmm_adata
6
11
  from .cli.load_adata import load_adata
7
12
  from .cli.preprocess_adata import preprocess_adata
8
13
  from .cli.spatial_adata import spatial_adata
9
- from .cli.hmm_adata import hmm_adata
14
+ from .informatics.pod5_functions import subsample_pod5
15
+ from .logging_utils import get_logger, setup_logging
16
+ from .readwrite import concatenate_h5ads
17
+
18
+
19
+ def _configure_multiprocessing() -> None:
20
+ import multiprocessing as mp
21
+ import sys
22
+
23
+ logger = get_logger(__name__)
24
+
25
+ try:
26
+ if sys.platform == "win32":
27
+ mp.set_start_method("spawn")
28
+ logger.debug("Setting multiprocessing start method to spawn")
29
+ else:
30
+ # try forkserver first, fallback to spawn
31
+ try:
32
+ mp.set_start_method("forkserver")
33
+ logger.debug("Setting multiprocessing start method to forkserver")
34
+ except ValueError:
35
+ mp.set_start_method("spawn")
36
+ logger.debug("Setting multiprocessing start method to spawn")
37
+ except RuntimeError:
38
+ logger.warning("Could not set multiprocessing start method")
10
39
 
11
- from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
12
40
 
13
41
  @click.group()
14
- def cli():
42
+ @click.option(
43
+ "--log-file",
44
+ type=click.Path(dir_okay=False, writable=True, path_type=Path),
45
+ default=None,
46
+ help="Optional file path to write smftools logs.",
47
+ )
48
+ @click.option(
49
+ "--log-level",
50
+ type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"], case_sensitive=False),
51
+ default="INFO",
52
+ show_default=True,
53
+ help="Logging level for smftools output.",
54
+ )
55
+ def cli(log_file: Path | None, log_level: str):
15
56
  """Command-line interface for smftools."""
16
- pass
57
+ level = getattr(logging, log_level.upper(), logging.INFO)
58
+ setup_logging(level=level, log_file=log_file)
59
+ _configure_multiprocessing()
60
+
17
61
 
18
62
  ####### Load anndata from raw data ###########
19
63
  @cli.command()
@@ -21,32 +65,44 @@ def cli():
21
65
  def load(config_path):
22
66
  """Load and process data from CONFIG_PATH."""
23
67
  load_adata(config_path)
68
+
69
+
24
70
  ##########################################
25
71
 
72
+
26
73
  ####### Preprocessing ###########
27
74
  @cli.command()
28
75
  @click.argument("config_path", type=click.Path(exists=True))
29
76
  def preprocess(config_path):
30
77
  """Preprocess data from CONFIG_PATH."""
31
78
  preprocess_adata(config_path)
79
+
80
+
32
81
  ##########################################
33
82
 
83
+
34
84
  ####### Spatial ###########
35
85
  @cli.command()
36
86
  @click.argument("config_path", type=click.Path(exists=True))
37
87
  def spatial(config_path):
38
88
  """Process data from CONFIG_PATH."""
39
89
  spatial_adata(config_path)
90
+
91
+
40
92
  ##########################################
41
93
 
94
+
42
95
  ####### HMM ###########
43
96
  @cli.command()
44
97
  @click.argument("config_path", type=click.Path(exists=True))
45
98
  def hmm(config_path):
46
99
  """Process data from CONFIG_PATH."""
47
100
  hmm_adata(config_path)
101
+
102
+
48
103
  ##########################################
49
104
 
105
+
50
106
  ####### batch command ###########
51
107
  @cli.command()
52
108
  @click.argument(
@@ -125,7 +181,9 @@ def batch(task, config_table: Path, column: str, sep: str | None):
125
181
  dtype=str,
126
182
  )
127
183
  except Exception as e:
128
- raise click.ClickException(f"Failed to read {config_table} as headerless list: {e}") from e
184
+ raise click.ClickException(
185
+ f"Failed to read {config_table} as headerless list: {e}"
186
+ ) from e
129
187
 
130
188
  config_series = df[column]
131
189
  else:
@@ -136,12 +194,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
136
194
  )
137
195
  config_series = df[column]
138
196
 
139
- config_paths = (
140
- config_series.dropna()
141
- .map(str)
142
- .map(lambda p: Path(p).expanduser())
143
- .tolist()
144
- )
197
+ config_paths = config_series.dropna().map(str).map(lambda p: Path(p).expanduser()).tolist()
145
198
 
146
199
  # ----------------------------
147
200
  # Validate config paths
@@ -162,9 +215,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
162
215
 
163
216
  func = task_funcs[task]
164
217
 
165
- click.echo(
166
- f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
167
- )
218
+ click.echo(f"Running task '{task}' on {len(config_paths)} config paths from {config_table}")
168
219
 
169
220
  # ----------------------------
170
221
  # Loop over paths
@@ -177,13 +228,16 @@ def batch(task, config_table: Path, column: str, sep: str | None):
177
228
  click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
178
229
 
179
230
  try:
180
- func(str(cfg)) # underlying functions take a string path
231
+ func(str(cfg)) # underlying functions take a string path
181
232
  except Exception as e:
182
233
  click.echo(f" ERROR on {cfg}: {e}")
183
234
 
184
235
  click.echo("Batch processing complete.")
236
+
237
+
185
238
  ##########################################
186
239
 
240
+
187
241
  ####### concatenate command ###########
188
242
  @cli.command("concatenate")
189
243
  @click.argument(
@@ -269,166 +323,53 @@ def concatenate_cmd(
269
323
 
270
324
  except Exception as e:
271
325
  raise click.ClickException(str(e)) from e
326
+
327
+
272
328
  ##########################################
273
329
 
274
- ####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
275
- # REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
276
- # OPTIONAL_KEYS = (
277
- # "adata_single_backups_path",
278
- # "adata_double_backups_path",
279
- # "output_path",
280
- # "merged_filename",
281
- # )
282
-
283
- # def _read_config_csv(csv_path: Path) -> Dict[str, str]:
284
- # """
285
- # Read a multi-row, two-column CSV of key,value pairs into a dict.
286
-
287
- # Supported features:
288
- # - Optional header ("key,value") or none.
289
- # - Comments starting with '#' and blank lines are ignored.
290
- # - If duplicate keys occur, the last one wins.
291
- # - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
292
- # """
293
- # try:
294
- # # Read as two columns regardless of header; comments ignored.
295
- # df = pd.read_csv(
296
- # csv_path,
297
- # dtype=str,
298
- # comment="#",
299
- # header=None, # treat everything as rows; we'll normalize below
300
- # usecols=[0, 1],
301
- # names=["key", "value"]
302
- # )
303
- # except Exception as e:
304
- # raise click.ClickException(f"Failed to read CSV: {e}") from e
305
-
306
- # # Drop completely empty rows
307
- # df = df.fillna("").astype(str)
308
- # df["key"] = df["key"].str.strip()
309
- # df["value"] = df["value"].str.strip()
310
- # df = df[(df["key"] != "") & (df["key"].notna())]
311
-
312
- # if df.empty:
313
- # raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
314
-
315
- # # Remove an optional header row if present
316
- # if df.iloc[0]["key"].lower() in {"key", "keys"}:
317
- # df = df.iloc[1:]
318
- # df = df[(df["key"] != "") & (df["key"].notna())]
319
- # if df.empty:
320
- # raise click.ClickException("Config CSV contains only a header row.")
321
-
322
- # # Build dict; last occurrence of a key wins
323
- # cfg = {}
324
- # for k, v in zip(df["key"], df["value"]):
325
- # cfg[k] = v
326
-
327
- # # Validate required keys
328
- # missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
329
- # if missing:
330
- # raise click.ClickException(
331
- # "Missing required keys in CSV: "
332
- # + ", ".join(missing)
333
- # + "\nExpected keys:\n - "
334
- # + "\n - ".join(REQUIRED_KEYS)
335
- # + "\nOptional keys:\n - "
336
- # + "\n - ".join(OPTIONAL_KEYS)
337
- # )
338
-
339
- # return cfg
340
-
341
- # def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
342
- # """Decide on the output .h5ad path based on CSV; create directories if needed."""
343
- # merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
344
- # if not merged_filename.endswith(".h5ad"):
345
- # merged_filename += ".h5ad"
346
-
347
- # output_path_raw = cfg.get("output_path", "").strip()
348
-
349
- # if not output_path_raw:
350
- # out_dir = Path.cwd() / "merged_output"
351
- # out_dir.mkdir(parents=True, exist_ok=True)
352
- # return out_dir / merged_filename
353
-
354
- # output_path = Path(output_path_raw)
355
-
356
- # if output_path.suffix.lower() == ".h5ad":
357
- # output_path.parent.mkdir(parents=True, exist_ok=True)
358
- # return output_path
359
-
360
- # # Treat as directory
361
- # output_path.mkdir(parents=True, exist_ok=True)
362
- # return output_path / merged_filename
363
-
364
- # def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
365
-
366
- # if backups:
367
- # click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
368
- # return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
369
- # else:
370
- # click.echo(f"Loading {label} from {primary} with backups disabled ...")
371
- # return safe_read_h5ad(primary, restore_backups=False)
372
-
373
-
374
- # @cli.command()
375
- # @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
376
- # def merge_barcoded_anndatas(config_path: Path):
377
- # """
378
- # Merge two AnnData objects from the same experiment that were demultiplexed
379
- # under different end-barcoding requirements, using a 1-row CSV for config.
380
-
381
- # CSV must include:
382
- # - adata_single_path
383
- # - adata_double_path
384
-
385
- # Optional columns:
386
- # - adata_single_backups_path
387
- # - adata_double_backups_path
388
- # - output_path (file or directory; default: ./merged_output/)
389
- # - merged_filename (default: merged_<single>__<double>.h5ad)
390
-
391
- # Example CSV:
392
-
393
- # adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
394
- # /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
395
- # """
396
- # try:
397
- # cfg = _read_config_csv(config_path)
398
-
399
- # single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
400
- # double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
401
-
402
- # for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
403
- # if not p.exists():
404
- # raise click.ClickException(f"{label} does not exist: {p}")
405
-
406
- # single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
407
- # double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
408
-
409
- # if single_backups and not single_backups.exists():
410
- # raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
411
- # if double_backups and not double_backups.exists():
412
- # raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
413
-
414
- # output_path = _resolve_output_path(cfg, single_path, double_path)
415
-
416
- # # Load
417
- # adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
418
- # adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
419
-
420
- # click.echo("Merging AnnDatas ...")
421
- # merged = merge_barcoded_anndatas_core(adata_single, adata_double)
422
-
423
- # click.echo(f"Writing merged AnnData to: {output_path}")
424
- # backup_dir = output_path.cwd() / "merged_backups"
425
- # safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
426
-
427
- # click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
428
-
429
- # except click.ClickException:
430
- # raise
431
- # except Exception as e:
432
- # # Surface unexpected errors cleanly
433
- # raise click.ClickException(f"Unexpected error: {e}") from e
434
- ################################################################################################################
330
+
331
+ ####### subsample pod5 command ###########
332
+ @cli.command("subsample-pod5")
333
+ @click.argument(
334
+ "pod5_path",
335
+ type=click.Path(exists=True, path_type=Path),
336
+ )
337
+ @click.option(
338
+ "--read-names",
339
+ "-r",
340
+ type=click.Path(exists=True, path_type=Path),
341
+ default=None,
342
+ help="Text file with one read_id per line.",
343
+ )
344
+ @click.option(
345
+ "--n-reads",
346
+ "-n",
347
+ type=int,
348
+ default=None,
349
+ help="Randomly subsample N reads.",
350
+ )
351
+ @click.option(
352
+ "--outdir",
353
+ "-o",
354
+ type=click.Path(path_type=Path, file_okay=False),
355
+ required=True,
356
+ help="Output directory for subsampled POD5.",
357
+ )
358
+ def subsample_pod5_cmd(pod5_path, read_names, n_reads, outdir):
359
+ """
360
+ Subsample POD5 file(s) by read ID list or random sampling.
361
+ """
362
+
363
+ # --- Validate mutually exclusive options ---
364
+ if (read_names is None and n_reads is None) or (read_names and n_reads):
365
+ raise click.UsageError("You must specify exactly ONE of --read-names or --n-reads.")
366
+
367
+ outdir.mkdir(parents=True, exist_ok=True)
368
+
369
+ subsample_arg = str(read_names) if read_names else n_reads
370
+
371
+ subsample_pod5(
372
+ pod5_path=str(pod5_path),
373
+ read_name_path=subsample_arg,
374
+ output_directory=str(outdir),
375
+ )
@@ -1 +1,3 @@
1
- from .experiment_config import LoadExperimentConfig, ExperimentConfig
1
+ from __future__ import annotations
2
+
3
+ from .experiment_config import ExperimentConfig, LoadExperimentConfig
@@ -9,6 +9,10 @@ conversion_types:
9
9
  # Read QC Params
10
10
  read_mod_filtering_use_other_c_as_background: True
11
11
 
12
+ # Spatial Analysis - Autocorr params
13
+ autocorr_site_types:
14
+ - "GpC"
15
+
12
16
  # Spatial Analysis - Clustermap params
13
17
  layer_for_clustermap_plotting: 'nan0_0minus1'
14
18
  clustermap_cmap_c: "coolwarm"
@@ -42,4 +46,4 @@ hmm_feature_sets:
42
46
  cpg_patch: [0, inf]
43
47
 
44
48
  hmm_merge_layer_features:
45
- - ["GpC_all_accessible_features", 80]
49
+ - ["all_accessible_features", 60]
@@ -60,4 +60,4 @@ hmm_feature_sets:
60
60
  nucleosome_depleted_region: [110, inf]
61
61
 
62
62
  hmm_merge_layer_features:
63
- - ["C_all_accessible_features", 80]
63
+ - ["all_accessible_features", 60]
@@ -1,7 +1,7 @@
1
1
  # General
2
2
  sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
3
- sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
- sample_name_col_for_plotting: 'Barcode'
3
+ sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
+ sample_name_col_for_plotting: 'Experiment_name_and_barcode'
5
5
 
6
6
  # Compute params
7
7
  threads: 4
@@ -9,9 +9,7 @@ device: "auto"
9
9
 
10
10
  ######## smftools load params #########
11
11
  # Generic i/o
12
- bam_suffix: ".bam"
13
12
  recursive_input_search: True
14
- split_dir: "demultiplexed_BAMs"
15
13
  strands:
16
14
  - bottom
17
15
  - top
@@ -21,7 +19,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
21
19
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
22
20
  input_already_demuxed: False # If the input files are already demultiplexed.
23
21
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
24
- delete_intermediate_bams: False # Whether to delete intermediate BAM files.
22
+ delete_intermediate_bams: True # Whether to delete intermediate BAM files.
25
23
  delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
26
24
 
27
25
  # Sequencing modality and general experiment params
@@ -53,7 +51,6 @@ aligner_args:
53
51
  - '-y'
54
52
  - '-N'
55
53
  - '5'
56
- - '--secondary=no'
57
54
  pacbio:
58
55
  - '-a'
59
56
  - '-x'
@@ -63,7 +60,6 @@ aligner_args:
63
60
  - '-y'
64
61
  - '-N'
65
62
  - '5'
66
- - '--secondary=no'
67
63
  illumina:
68
64
  - '-a'
69
65
  - '-x'
@@ -73,7 +69,6 @@ aligner_args:
73
69
  - '-y'
74
70
  - '-N'
75
71
  - '5'
76
- - '--secondary=no'
77
72
  dorado:
78
73
  ont:
79
74
  - "--mm2-opts"
@@ -82,15 +77,18 @@ aligner_args:
82
77
  # Sorted BAM and BED specific handling
83
78
  make_bigwigs: False # Whether to make coverage bigwigs
84
79
  make_beds: False # Whether to make beds from the aligned bams
80
+ samtools_backend: auto # auto|python|cli for samtools-compatible operations
81
+ bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
82
+ bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
85
83
 
86
84
  # Nanopore specific demultiplexing
87
85
  barcode_both_ends: False # dorado demultiplexing
88
86
  trim: False # dorado adapter and barcode removal during demultiplexing
89
87
 
90
88
  # Anndata structure
91
- mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
89
+ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
92
90
  reference_column: 'Reference_strand'
93
- sample_column: 'Barcode'
91
+ sample_column: 'Experiment_name_and_barcode'
94
92
 
95
93
  ######## smftools preprocess params #########
96
94
  # Read length, quality, and mapping filtering params
@@ -101,7 +99,7 @@ read_len_filter_thresholds:
101
99
  - 100
102
100
  - null
103
101
  read_len_to_ref_ratio_filter_thresholds:
104
- - 0.5
102
+ - null
105
103
  - null
106
104
  read_quality_filter_thresholds:
107
105
  - 15
@@ -179,13 +177,12 @@ umap_layers_to_plot:
179
177
  - "Raw_modification_signal"
180
178
 
181
179
  # Spatial Analysis - Spatial Autocorrelation params
180
+ autocorr_normalization_method: "pearson" # options are pearson or sum
182
181
  rows_per_qc_autocorr_grid: 6
183
182
  autocorr_rolling_window_size: 25
184
183
  autocorr_max_lag: 800
185
184
  autocorr_site_types:
186
185
  - "GpC"
187
- - "CpG"
188
- - "C"
189
186
 
190
187
  # Spatial Analysis - Correlation Matrix params
191
188
  correlation_matrix_types:
@@ -210,10 +207,19 @@ hmm_init_start_probs:
210
207
  - 0.5
211
208
  - 0.5
212
209
  hmm_eps: 1e-8
210
+ # Fitting strategy
211
+ hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
212
+ hmm_shared_scope: ["reference", "methbase"]
213
+ hmm_groupby: ["sample", "reference", "methbase"]
214
+ # If hmm_fit_strategy == shared_transitions
215
+ hmm_adapt_emissions: true
216
+ hmm_adapt_startprobs: true
217
+ hmm_emission_adapt_iters: 5
218
+ hmm_emission_adapt_tol: 1.0e-4
213
219
  hmm_dtype: "float64"
214
- hmm_annotation_threshold: 0.5
215
- hmm_batch_size: 1024
216
- hmm_use_viterbi: False
220
+ hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
221
+ hmm_batch_size: 1024 # hmm batch size
222
+ hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
217
223
  footprints: True # whether to use the default HMM footprint params
218
224
  accessible_patches: True # whether to use the default HMM accessible patch params
219
225
  cpg: False # whether to use the default HMM endogenous CpG patch params
@@ -238,7 +244,7 @@ hmm_feature_sets:
238
244
  large_accessible_patch: [40, 110]
239
245
  nucleosome_depleted_region: [110, inf]
240
246
  hmm_merge_layer_features:
241
- - [null, 80]
247
+ - ["all_accessible_features", 60]
242
248
  clustermap_cmap_hmm: "coolwarm"
243
249
  hmm_clustermap_feature_layers:
244
250
  - all_accessible_features
@@ -246,7 +252,9 @@ hmm_clustermap_feature_layers:
246
252
  - small_accessible_patch
247
253
  - mid_accessible_patch
248
254
  - large_accessible_patch
255
+ - large_accessible_patch_merged
249
256
  - nucleosome_depleted_region
257
+ - nucleosome_depleted_region_merged
250
258
  - small_bound_stretch
251
259
  - medium_bound_stretch
252
260
  - putative_nucleosome
@@ -365,4 +373,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
365
373
  bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
366
374
  force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
367
375
  bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
368
- force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
376
+ force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
@@ -27,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
27
27
 
28
28
  ######## smftools preprocess params ########
29
29
  fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
30
- binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
30
+ binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
31
31
  positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
32
32
  negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
33
- infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
33
+ infer_on_percentile_sample_methylation_fitting: 5 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
34
34
  inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
35
35
  fit_j_threshold: 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
36
36
  output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
@@ -39,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
39
39
  autocorr_site_types:
40
40
  - "A"
41
41
 
42
+ spatial_clustermap_sortby: "a"
43
+
42
44
  ######## smftools hmm params #########
43
45
  hmm_methbases:
44
- - "A"
46
+ - "A"
47
+
48
+ hmm_merge_layer_features:
49
+ - ["A_all_accessible_features", 60]
@@ -1,11 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
- from typing import Dict, List, Any, Iterable, Union
4
+ from typing import Any, Dict, List, Union
5
+
6
+ from smftools.constants import BAM_SUFFIX
7
+
5
8
 
6
9
  def discover_input_files(
7
10
  input_data_path: Union[str, Path],
8
- bam_suffix: str = ".bam",
11
+ bam_suffix: str = BAM_SUFFIX,
9
12
  recursive: bool = False,
10
13
  follow_symlinks: bool = False,
11
14
  ) -> Dict[str, Any]:
@@ -30,10 +33,21 @@ def discover_input_files(
30
33
  bam_suffix = bam_suffix.lower()
31
34
 
32
35
  # Sets of canonical extension keys we’ll compare against
33
- pod5_exts = {".pod5", ".p5"}
36
+ pod5_exts = {".pod5", ".p5"}
34
37
  fast5_exts = {".fast5", ".f5"}
35
- fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
36
- h5ad_exts = {".h5ad", ".h5"}
38
+ fastq_exts = {
39
+ ".fastq",
40
+ ".fq",
41
+ ".fastq.gz",
42
+ ".fq.gz",
43
+ ".fastq.bz2",
44
+ ".fq.bz2",
45
+ ".fastq.xz",
46
+ ".fq.xz",
47
+ ".fastq.zst",
48
+ ".fq.zst",
49
+ }
50
+ h5ad_exts = {".h5ad", ".h5"}
37
51
  compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
38
52
 
39
53
  def ext_key(pp: Path) -> str: