smftools 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +29 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  5. smftools/datasets/F1_sample_sheet.csv +5 -0
  6. smftools/datasets/__init__.py +9 -0
  7. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  8. smftools/datasets/datasets.py +28 -0
  9. smftools/informatics/__init__.py +16 -0
  10. smftools/informatics/archived/bam_conversion.py +59 -0
  11. smftools/informatics/archived/bam_direct.py +63 -0
  12. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  13. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  14. smftools/informatics/basecall_pod5s.py +80 -0
  15. smftools/informatics/conversion_smf.py +132 -0
  16. smftools/informatics/direct_smf.py +137 -0
  17. smftools/informatics/fast5_to_pod5.py +21 -0
  18. smftools/informatics/helpers/LoadExperimentConfig.py +75 -0
  19. smftools/informatics/helpers/__init__.py +74 -0
  20. smftools/informatics/helpers/align_and_sort_BAM.py +59 -0
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +74 -0
  22. smftools/informatics/helpers/archived/informatics.py +260 -0
  23. smftools/informatics/helpers/archived/load_adata.py +516 -0
  24. smftools/informatics/helpers/bam_qc.py +66 -0
  25. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  26. smftools/informatics/helpers/binarize_converted_base_identities.py +79 -0
  27. smftools/informatics/helpers/canoncall.py +34 -0
  28. smftools/informatics/helpers/complement_base_list.py +21 -0
  29. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +55 -0
  30. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  31. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  32. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  33. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  34. smftools/informatics/helpers/extract_base_identities.py +44 -0
  35. smftools/informatics/helpers/extract_mods.py +83 -0
  36. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  37. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  38. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  39. smftools/informatics/helpers/find_conversion_sites.py +50 -0
  40. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  41. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  42. smftools/informatics/helpers/get_native_references.py +28 -0
  43. smftools/informatics/helpers/index_fasta.py +12 -0
  44. smftools/informatics/helpers/make_dirs.py +21 -0
  45. smftools/informatics/helpers/make_modbed.py +27 -0
  46. smftools/informatics/helpers/modQC.py +27 -0
  47. smftools/informatics/helpers/modcall.py +36 -0
  48. smftools/informatics/helpers/modkit_extract_to_adata.py +884 -0
  49. smftools/informatics/helpers/ohe_batching.py +76 -0
  50. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  51. smftools/informatics/helpers/one_hot_decode.py +27 -0
  52. smftools/informatics/helpers/one_hot_encode.py +57 -0
  53. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +53 -0
  54. smftools/informatics/helpers/run_multiqc.py +28 -0
  55. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  56. smftools/informatics/helpers/split_and_index_BAM.py +36 -0
  57. smftools/informatics/load_adata.py +182 -0
  58. smftools/informatics/readwrite.py +106 -0
  59. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  60. smftools/informatics/subsample_pod5.py +104 -0
  61. smftools/plotting/__init__.py +15 -0
  62. smftools/plotting/classifiers.py +355 -0
  63. smftools/plotting/general_plotting.py +205 -0
  64. smftools/plotting/position_stats.py +462 -0
  65. smftools/preprocessing/__init__.py +33 -0
  66. smftools/preprocessing/append_C_context.py +82 -0
  67. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  68. smftools/preprocessing/archives/preprocessing.py +614 -0
  69. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  70. smftools/preprocessing/binarize_on_Youden.py +45 -0
  71. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  72. smftools/preprocessing/calculate_complexity.py +72 -0
  73. smftools/preprocessing/calculate_consensus.py +47 -0
  74. smftools/preprocessing/calculate_converted_read_methylation_stats.py +94 -0
  75. smftools/preprocessing/calculate_coverage.py +42 -0
  76. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  77. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  78. smftools/preprocessing/calculate_position_Youden.py +115 -0
  79. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  80. smftools/preprocessing/clean_NaN.py +46 -0
  81. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  82. smftools/preprocessing/filter_converted_reads_on_methylation.py +44 -0
  83. smftools/preprocessing/filter_reads_on_length.py +51 -0
  84. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  85. smftools/preprocessing/invert_adata.py +30 -0
  86. smftools/preprocessing/load_sample_sheet.py +38 -0
  87. smftools/preprocessing/make_dirs.py +21 -0
  88. smftools/preprocessing/min_non_diagonal.py +25 -0
  89. smftools/preprocessing/recipes.py +127 -0
  90. smftools/preprocessing/subsample_adata.py +58 -0
  91. smftools/readwrite.py +198 -0
  92. smftools/tools/__init__.py +49 -0
  93. smftools/tools/apply_hmm.py +202 -0
  94. smftools/tools/apply_hmm_batched.py +241 -0
  95. smftools/tools/archived/classify_methylated_features.py +66 -0
  96. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  97. smftools/tools/archived/subset_adata_v1.py +32 -0
  98. smftools/tools/archived/subset_adata_v2.py +46 -0
  99. smftools/tools/calculate_distances.py +18 -0
  100. smftools/tools/calculate_umap.py +62 -0
  101. smftools/tools/call_hmm_peaks.py +105 -0
  102. smftools/tools/classifiers.py +787 -0
  103. smftools/tools/cluster_adata_on_methylation.py +105 -0
  104. smftools/tools/data/__init__.py +2 -0
  105. smftools/tools/data/anndata_data_module.py +90 -0
  106. smftools/tools/data/preprocessing.py +6 -0
  107. smftools/tools/display_hmm.py +18 -0
  108. smftools/tools/evaluation/__init__.py +0 -0
  109. smftools/tools/general_tools.py +69 -0
  110. smftools/tools/hmm_readwrite.py +16 -0
  111. smftools/tools/inference/__init__.py +1 -0
  112. smftools/tools/inference/lightning_inference.py +41 -0
  113. smftools/tools/models/__init__.py +9 -0
  114. smftools/tools/models/base.py +14 -0
  115. smftools/tools/models/cnn.py +34 -0
  116. smftools/tools/models/lightning_base.py +41 -0
  117. smftools/tools/models/mlp.py +17 -0
  118. smftools/tools/models/positional.py +17 -0
  119. smftools/tools/models/rnn.py +16 -0
  120. smftools/tools/models/sklearn_models.py +40 -0
  121. smftools/tools/models/transformer.py +133 -0
  122. smftools/tools/models/wrappers.py +20 -0
  123. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  124. smftools/tools/position_stats.py +239 -0
  125. smftools/tools/read_stats.py +70 -0
  126. smftools/tools/subset_adata.py +28 -0
  127. smftools/tools/train_hmm.py +78 -0
  128. smftools/tools/training/__init__.py +1 -0
  129. smftools/tools/training/train_lightning_model.py +47 -0
  130. smftools/tools/utils/__init__.py +2 -0
  131. smftools/tools/utils/device.py +10 -0
  132. smftools/tools/utils/grl.py +14 -0
  133. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/METADATA +5 -2
  134. smftools-0.1.7.dist-info/RECORD +136 -0
  135. smftools-0.1.6.dist-info/RECORD +0 -4
  136. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  137. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/readwrite.py ADDED
@@ -0,0 +1,198 @@
+ ## readwrite ##
+
+ ######################################################################################################
+ ## Datetime functionality
+ def date_string():
+     """
+     Each time this is called, it returns the current date string.
+     """
+     from datetime import datetime
+     current_date = datetime.now()
+     date_string = current_date.strftime("%Y%m%d")
+     date_string = date_string[2:]
+     return date_string
+
+ def time_string():
+     """
+     Each time this is called, it returns the current time string.
+     """
+     from datetime import datetime
+     current_time = datetime.now()
+     return current_time.strftime("%H:%M:%S")
+ ######################################################################################################
+
+ ######################################################################################################
+ ## Numpy, Pandas, Anndata functionality
+
+ def adata_to_df(adata, layer=None):
+     """
+     Convert an AnnData object into a Pandas DataFrame.
+
+     Parameters:
+         adata (AnnData): The input AnnData object.
+         layer (str, optional): The layer to extract. If None, uses adata.X.
+
+     Returns:
+         pd.DataFrame: A DataFrame where rows are observations and columns are positions.
+     """
+     import pandas as pd
+     import anndata as ad
+     import numpy as np
+
+     # Validate that the requested layer exists
+     if layer and layer not in adata.layers:
+         raise ValueError(f"Layer '{layer}' not found in adata.layers.")
+
+     # Extract the data matrix
+     data_matrix = adata.layers.get(layer, adata.X)
+
+     # Ensure the matrix is dense (handle sparse formats)
+     if hasattr(data_matrix, "toarray"):
+         data_matrix = data_matrix.toarray()
+
+     # Ensure obs and var have unique indices
+     if adata.obs.index.duplicated().any():
+         raise ValueError("Duplicate values found in `adata.obs.index`. Ensure unique observation indices.")
+
+     if adata.var.index.duplicated().any():
+         raise ValueError("Duplicate values found in `adata.var.index`. Ensure unique variable indices.")
+
+     # Convert to DataFrame
+     df = pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)
+
+     return df
+
+
+ def save_matrix(matrix, save_name):
+     """
+     Input: A numpy matrix and a save_name.
+     Output: A txt file representation of the data matrix.
+     """
+     import numpy as np
+     np.savetxt(f'{save_name}.txt', matrix)
+
+ def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
+     """
+     Concatenate all h5ad files in the current working directory and optionally delete them after the final adata is written out.
+     Input: an output file path relative to the directory in which the function is called.
+     """
+     import os
+     import anndata as ad
+     # Suppress runtime warnings from anndata
+     import warnings
+     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
+     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
+
+     # List all files in the current working directory
+     files = os.listdir(os.getcwd())
+     suffix = file_suffix
+     # Keep file names that contain the suffix
+     hdfs = [hdf for hdf in files if suffix in hdf]
+     # Sort the file list by name and report it
+     hdfs.sort()
+     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
+     # Iterate over the h5ad files and concatenate them
+     final_adata = None
+     for hdf in hdfs:
+         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
+         temp_adata = ad.read_h5ad(hdf)
+         if final_adata is not None:
+             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
+             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
+         else:
+             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
+             final_adata = temp_adata
+     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
+     final_adata.write_h5ad(output_file, compression='gzip')
+
+     # Delete the individual h5ad files and only keep the final concatenated file
+     if delete_inputs:
+         files = os.listdir(os.getcwd())
+         hdfs = [hdf for hdf in files if suffix in hdf]
+         if output_file in hdfs:
+             hdfs.remove(output_file)
+         # Iterate over the files and delete them
+         for hdf in hdfs:
+             try:
+                 os.remove(hdf)
+                 print(f"Deleted file: {hdf}")
+             except OSError as e:
+                 print(f"Error deleting file {hdf}: {e}")
+     else:
+         print('Keeping input files')
+
+ def safe_write_h5ad(adata, path, compression="gzip", backup=False, backup_dir="./"):
+     """
+     Saves an AnnData object safely by omitting problematic columns from .obs and .var.
+
+     Parameters:
+         adata (AnnData): The AnnData object to save.
+         path (str): Output .h5ad file path.
+         compression (str): Compression method for the h5ad file.
+         backup (bool): If True, saves problematic columns to CSV files.
+         backup_dir (str): Directory to store backups if backup=True.
+     """
+     import anndata as ad
+     import pandas as pd
+     import os
+
+     os.makedirs(backup_dir, exist_ok=True)
+
+     def filter_df(df, df_name):
+         # Flag object columns that hold anything other than strings or None
+         bad_cols = []
+         for col in df.columns:
+             if df[col].dtype == 'object':
+                 if not df[col].apply(lambda x: isinstance(x, (str, type(None)))).all():
+                     bad_cols.append(col)
+         if bad_cols:
+             print(f"⚠️ Skipping columns from {df_name}: {bad_cols}")
+             if backup:
+                 df[bad_cols].to_csv(os.path.join(backup_dir, f"{df_name}_skipped_columns.csv"))
+                 print(f"📝 Backed up skipped columns to {backup_dir}/{df_name}_skipped_columns.csv")
+         return df.drop(columns=bad_cols)
+
+     # Clean obs and var
+     obs_clean = filter_df(adata.obs, "obs")
+     var_clean = filter_df(adata.var, "var")
+
+     # Save the cleaned copy
+     adata_copy = ad.AnnData(
+         X=adata.X,
+         obs=obs_clean,
+         var=var_clean,
+         layers=adata.layers,
+         uns=adata.uns,
+         obsm=adata.obsm,
+         varm=adata.varm
+     )
+     adata_copy.write_h5ad(path, compression=compression)
+     print(f"✅ Saved safely to {path}")
+
+ def merge_barcoded_anndatas(adata_single, adata_double):
+     """
+     Merge single-barcode and double-barcode AnnData objects, keeping the
+     double-barcode copy of any read present in both.
+     """
+     import numpy as np
+     import anndata as ad
+
+     # Step 1: Identify overlapping read names
+     overlap = np.intersect1d(adata_single.obs_names, adata_double.obs_names)
+
+     # Step 2: Filter overlapping reads out of adata_single
+     adata_single_filtered = adata_single[~adata_single.obs_names.isin(overlap)].copy()
+
+     # Step 3: Add a source tag
+     adata_single_filtered.obs['source'] = 'single_barcode'
+     adata_double.obs['source'] = 'double_barcode'
+
+     # Step 4: Concatenate the components
+     adata_merged = ad.concat([
+         adata_single_filtered,
+         adata_double
+     ], join='outer', merge='same')  # merge='same' preserves matching layers, obsm, etc.
+
+     # Step 5: Merge `.uns`
+     adata_merged.uns = {**adata_single.uns, **adata_double.uns}
+
+     return adata_merged
+
+ ######################################################################################################
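For orientation, a minimal usage sketch of these readwrite helpers (not part of the wheel; the input file name, layer choice, and backup directory below are assumptions for illustration):

    import anndata as ad
    from smftools import readwrite

    adata = ad.read_h5ad("sample.h5ad.gz")                   # hypothetical input file
    df = readwrite.adata_to_df(adata)                        # dense obs x var DataFrame from adata.X
    readwrite.save_matrix(df.to_numpy(), f"matrix_{readwrite.date_string()}")
    readwrite.safe_write_h5ad(adata, "sample_clean.h5ad.gz", backup=True, backup_dir="./backups")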
smftools/tools/__init__.py ADDED
@@ -0,0 +1,49 @@
+ from .apply_hmm import apply_hmm
+ from .apply_hmm_batched import apply_hmm_batched
+ from .position_stats import calculate_relative_risk_on_activity, compute_positionwise_statistic
+ from .calculate_distances import calculate_distances
+ from .calculate_umap import calculate_umap
+ from .call_hmm_peaks import call_hmm_peaks
+ from .classifiers import run_training_loop, run_inference, evaluate_models_by_subgroup, prepare_melted_model_data, sliding_window_train_test
+ from .cluster_adata_on_methylation import cluster_adata_on_methylation
+ from .display_hmm import display_hmm
+ from .general_tools import create_nan_mask_from_X, combine_layers, create_nan_or_non_gpc_mask
+ from .hmm_readwrite import load_hmm, save_hmm
+ from .nucleosome_hmm_refinement import refine_nucleosome_calls, infer_nucleosomes_in_large_bound
+ from .read_stats import calculate_row_entropy
+ from .subset_adata import subset_adata
+ from .train_hmm import train_hmm
+
+ from . import models
+ from . import data
+ from . import utils
+ from . import evaluation
+ from . import inference
+ from . import training
+
+ __all__ = [
+     "apply_hmm",
+     "apply_hmm_batched",
+     "calculate_distances",
+     "compute_positionwise_statistic",
+     "calculate_row_entropy",
+     "calculate_umap",
+     "calculate_relative_risk_on_activity",
+     "call_hmm_peaks",
+     "cluster_adata_on_methylation",
+     "create_nan_mask_from_X",
+     "create_nan_or_non_gpc_mask",
+     "combine_layers",
+     "display_hmm",
+     "evaluate_models_by_subgroup",
+     "load_hmm",
+     "prepare_melted_model_data",
+     "refine_nucleosome_calls",
+     "infer_nucleosomes_in_large_bound",
+     "run_training_loop",
+     "run_inference",
+     "save_hmm",
+     "sliding_window_train_test",
+     "subset_adata",
+     "train_hmm"
+ ]
smftools/tools/apply_hmm.py ADDED
@@ -0,0 +1,202 @@
+ import numpy as np
+ import pandas as pd
+ import torch
+ from tqdm import tqdm
+
+ def apply_hmm(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
+     """
+     Applies an HMM model to an AnnData object using tensor-based sequence inputs.
+     If multiple methbases are passed, a combined feature set is also generated.
+     """
+     model.to(device)
+
+     # --- Feature Definitions ---
+     feature_sets = {}
+     if footprints:
+         feature_sets["footprint"] = {
+             "features": {
+                 "small_bound_stretch": [0, 30],
+                 "medium_bound_stretch": [30, 80],
+                 "putative_nucleosome": [80, 200],
+                 "large_bound_stretch": [200, np.inf]
+             },
+             "state": "Non-Methylated"
+         }
+     if accessible_patches:
+         feature_sets["accessible"] = {
+             "features": {
+                 "small_accessible_patch": [0, 30],
+                 "mid_accessible_patch": [30, 80],
+                 "large_accessible_patch": [80, np.inf]
+             },
+             "state": "Methylated"
+         }
+     if cpg:
+         feature_sets["cpg"] = {
+             "features": {
+                 "cpg_patch": [0, np.inf]
+             },
+             "state": "Methylated"
+         }
+
+     # --- Init columns ---
+     all_features = []
+     combined_prefix = "Combined"
+     for key, fs in feature_sets.items():
+         if key == 'cpg':
+             all_features += [f"CpG_{f}" for f in fs["features"]]
+             all_features.append(f"CpG_all_{key}_features")
+         else:
+             for methbase in methbases:
+                 all_features += [f"{methbase}_{f}" for f in fs["features"]]
+                 all_features.append(f"{methbase}_all_{key}_features")
+             all_features += [f"{combined_prefix}_{f}" for f in fs["features"]]
+             all_features.append(f"{combined_prefix}_all_{key}_features")
+
+     for feature in all_features:
+         adata.obs[feature] = pd.Series([[] for _ in range(adata.shape[0])], dtype=object, index=adata.obs.index)
+         adata.obs[f"{feature}_distances"] = pd.Series([None] * adata.shape[0], index=adata.obs.index)
+         adata.obs[f"n_{feature}"] = -1
+
+     # --- Main loop ---
+     references = adata.obs[obs_column].cat.categories
+
+     for ref in tqdm(references, desc="Processing References"):
+         ref_subset = adata[adata.obs[obs_column] == ref]
+
+         # Process each methbase and build a combined mask along the way
+         combined_mask = None
+         for methbase in methbases:
+             mask = {
+                 "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
+                 "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
+                 "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
+             }[methbase.lower()]
+             combined_mask = mask if combined_mask is None else combined_mask | mask
+
+             methbase_subset = ref_subset[:, mask]
+             matrix = methbase_subset.layers[layer] if layer else methbase_subset.X
+
+             for i, raw_read in enumerate(matrix):
+                 # Randomly impute missing calls so the HMM sees a complete binary sequence
+                 read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
+                 tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
+                 coords = methbase_subset.var_names
+
+                 for key, fs in feature_sets.items():
+                     if key == 'cpg':
+                         continue
+                     state_target = fs["state"]
+                     feature_map = fs["features"]
+
+                     classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
+                     idx = methbase_subset.obs.index[i]
+
+                     for start, length, label, prob in classifications:
+                         adata.obs.at[idx, f"{methbase}_{label}"].append([start, length, prob])
+                         adata.obs.at[idx, f"{methbase}_all_{key}_features"].append([start, length, prob])
+
+         # Combined methbase subset
+         combined_subset = ref_subset[:, combined_mask]
+         combined_matrix = combined_subset.layers[layer] if layer else combined_subset.X
+
+         for i, raw_read in enumerate(combined_matrix):
+             read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
+             tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
+             coords = combined_subset.var_names
+
+             for key, fs in feature_sets.items():
+                 if key == 'cpg':
+                     continue
+                 state_target = fs["state"]
+                 feature_map = fs["features"]
+
+                 classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
+                 idx = combined_subset.obs.index[i]
+
+                 for start, length, label, prob in classifications:
+                     adata.obs.at[idx, f"{combined_prefix}_{label}"].append([start, length, prob])
+                     adata.obs.at[idx, f"{combined_prefix}_all_{key}_features"].append([start, length, prob])
+
+     # --- Special handling for CpG ---
+     if cpg:
+         for ref in tqdm(references, desc="Processing CpG"):
+             ref_subset = adata[adata.obs[obs_column] == ref]
+             mask = (ref_subset.var[f"{ref}_CpG_site"] == True)
+             cpg_subset = ref_subset[:, mask]
+             matrix = cpg_subset.layers[layer] if layer else cpg_subset.X
+
+             for i, raw_read in enumerate(matrix):
+                 read = [int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in raw_read]
+                 tensor_read = torch.tensor(read, dtype=torch.long, device=device).unsqueeze(0).unsqueeze(-1)
+                 coords = cpg_subset.var_names
+                 fs = feature_sets['cpg']
+                 state_target = fs["state"]
+                 feature_map = fs["features"]
+                 classifications = classify_features(tensor_read, model, coords, feature_map, target_state=state_target)
+                 idx = cpg_subset.obs.index[i]
+                 for start, length, label, prob in classifications:
+                     adata.obs.at[idx, f"CpG_{label}"].append([start, length, prob])
+                     adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])
+
+     # --- Binarization + Distance ---
+     for feature in tqdm(all_features, desc="Finalizing Layers"):
+         bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
+         counts = np.zeros(adata.shape[0], dtype=int)
+         for row_idx, intervals in enumerate(adata.obs[feature]):
+             if not isinstance(intervals, list):
+                 intervals = []
+             for start, length, prob in intervals:
+                 if prob > threshold:
+                     bin_matrix[row_idx, start:start+length] = 1
+                     counts[row_idx] += 1
+         adata.layers[f"{feature}"] = bin_matrix
+         adata.obs[f"n_{feature}"] = counts
+         adata.obs[f"{feature}_distances"] = adata.obs[feature].apply(lambda x: calculate_distances(x, threshold))
+
+ def calculate_distances(intervals, threshold=0.9):
+     """Calculates distances between consecutive features in a read."""
+     intervals = sorted([iv for iv in intervals if iv[2] > threshold], key=lambda x: x[0])
+     distances = [(intervals[i + 1][0] - (intervals[i][0] + intervals[i][1]))
+                  for i in range(len(intervals) - 1)]
+     return distances
+
+
+ def classify_features(sequence, model, coordinates, classification_mapping={}, target_state="Methylated"):
+     """
+     Classifies regions based on HMM state.
+
+     Parameters:
+         sequence (torch.Tensor): Tensor of binarized data [batch_size, seq_len, 1]
+         model: Trained pomegranate HMM
+         coordinates (list): Genomic coordinates for the sequence
+         classification_mapping (dict): Mapping for feature labeling
+         target_state (str): The state to classify ("Methylated" or "Non-Methylated")
+     """
+     predicted_states = model.predict(sequence).squeeze(-1).squeeze(0).cpu().numpy()
+     probabilities = model.predict_proba(sequence).squeeze(0).cpu().numpy()
+     state_labels = ["Non-Methylated", "Methylated"]
+
+     classifications, current_start, current_length, current_probs = [], None, 0, []
+
+     for i, state_index in enumerate(predicted_states):
+         state_name = state_labels[state_index]
+         state_prob = probabilities[i][state_index]
+
+         if state_name == target_state:
+             if current_start is None:
+                 current_start = i
+             current_length += 1
+             current_probs.append(state_prob)
+         elif current_start is not None:
+             classifications.append((current_start, current_length, np.mean(current_probs)))
+             current_start, current_length, current_probs = None, 0, []
+
+     if current_start is not None:
+         classifications.append((current_start, current_length, np.mean(current_probs)))
+
+     final = []
+     for start, length, prob in classifications:
+         feature_length = int(coordinates[start + length - 1]) - int(coordinates[start]) + 1
+         label = next((ftype for ftype, rng in classification_mapping.items() if rng[0] <= feature_length < rng[1]), target_state)
+         final.append((int(coordinates[start]) + 1, feature_length, label, prob))
+     return final
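A hedged sketch of how apply_hmm might be invoked on a prepared AnnData object (the file names, obs column, and layer name below are assumptions for illustration; the model is a trained pomegranate-style HMM loaded via load_hmm from this subpackage):

    import anndata as ad
    from smftools.tools import apply_hmm, load_hmm

    adata = ad.read_h5ad("experiment.h5ad.gz")        # hypothetical preprocessed input
    hmm = load_hmm("footprint_hmm.pt")                # hypothetical saved-model path
    apply_hmm(
        adata, hmm,
        obs_column="Reference",                       # assumed categorical reference column
        layer="binarized_methylation",                # assumed binary methylation layer
        footprints=True, accessible_patches=True,
        methbases=["GpC"], device="cpu", threshold=0.7,
    )
    # Per-read feature intervals land in adata.obs (e.g. "GpC_putative_nucleosome"),
    # and a binarized per-feature mask is written to adata.layers.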
smftools/tools/apply_hmm_batched.py ADDED
@@ -0,0 +1,241 @@
+ import numpy as np
+ import pandas as pd
+ import torch
+ from tqdm import tqdm
+
+ def apply_hmm_batched(adata, model, obs_column, layer=None, footprints=True, accessible_patches=False, cpg=False, methbases=["GpC", "CpG", "A"], device="cpu", threshold=0.7):
+     """
+     Applies an HMM model to an AnnData object using batched tensor-based sequence inputs.
+     If multiple methbases are passed, a combined feature set is also generated.
+     """
+     model.to(device)
+
+     # --- Feature Definitions ---
+     feature_sets = {}
+     if footprints:
+         feature_sets["footprint"] = {
+             "features": {
+                 "small_bound_stretch": [0, 20],
+                 "medium_bound_stretch": [20, 50],
+                 "putative_nucleosome": [50, 200],
+                 "large_bound_stretch": [200, np.inf]
+             },
+             "state": "Non-Methylated"
+         }
+     if accessible_patches:
+         feature_sets["accessible"] = {
+             "features": {
+                 "small_accessible_patch": [0, 20],
+                 "mid_accessible_patch": [20, 80],
+                 "large_accessible_patch": [80, np.inf]
+             },
+             "state": "Methylated"
+         }
+     if cpg:
+         feature_sets["cpg"] = {
+             "features": {
+                 "cpg_patch": [0, np.inf]
+             },
+             "state": "Methylated"
+         }
+
+     # --- Init columns ---
+     all_features = []
+     combined_prefix = "Combined"
+     for key, fs in feature_sets.items():
+         if key == 'cpg':
+             all_features += [f"CpG_{f}" for f in fs["features"]]
+             all_features.append(f"CpG_all_{key}_features")
+         else:
+             for methbase in methbases:
+                 all_features += [f"{methbase}_{f}" for f in fs["features"]]
+                 all_features.append(f"{methbase}_all_{key}_features")
+             if len(methbases) > 1:
+                 all_features += [f"{combined_prefix}_{f}" for f in fs["features"]]
+                 all_features.append(f"{combined_prefix}_all_{key}_features")
+
+     for feature in all_features:
+         adata.obs[feature] = [[] for _ in range(adata.shape[0])]
+         adata.obs[f"{feature}_distances"] = [None] * adata.shape[0]
+         adata.obs[f"n_{feature}"] = -1
+
+     # --- Main loop ---
+     references = adata.obs[obs_column].cat.categories
+
+     for ref in tqdm(references, desc="Processing References"):
+         ref_subset = adata[adata.obs[obs_column] == ref]
+
+         # Process each methbase and build a combined mask along the way
+         combined_mask = None
+         for methbase in methbases:
+             mask = {
+                 "a": ref_subset.var[f"{ref}_strand_FASTA_base"] == "A",
+                 "gpc": ref_subset.var[f"{ref}_GpC_site"] == True,
+                 "cpg": ref_subset.var[f"{ref}_CpG_site"] == True
+             }[methbase.lower()]
+             combined_mask = mask if combined_mask is None else combined_mask | mask
+
+             methbase_subset = ref_subset[:, mask]
+             matrix = methbase_subset.layers[layer] if layer else methbase_subset.X
+
+             # Randomly impute missing calls so the HMM sees complete binary sequences
+             processed_reads = [[int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in read] for read in matrix]
+             tensor_batch = torch.tensor(processed_reads, dtype=torch.long, device=device).unsqueeze(-1)
+
+             coords = methbase_subset.var_names
+             for key, fs in feature_sets.items():
+                 if key == 'cpg':
+                     continue
+                 state_target = fs["state"]
+                 feature_map = fs["features"]
+
+                 pred_states = model.predict(tensor_batch)
+                 probs = model.predict_proba(tensor_batch)
+                 classifications = classify_batch(pred_states, probs, coords, feature_map, target_state=state_target)
+
+                 for i, idx in enumerate(methbase_subset.obs.index):
+                     for start, length, label, prob in classifications[i]:
+                         adata.obs.at[idx, f"{methbase}_{label}"].append([start, length, prob])
+                         adata.obs.at[idx, f"{methbase}_all_{key}_features"].append([start, length, prob])
+
+         # Combined subset
+         if len(methbases) > 1:
+             combined_subset = ref_subset[:, combined_mask]
+             combined_matrix = combined_subset.layers[layer] if layer else combined_subset.X
+             processed_combined_reads = [[int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in read] for read in combined_matrix]
+             tensor_combined_batch = torch.tensor(processed_combined_reads, dtype=torch.long, device=device).unsqueeze(-1)
+
+             coords = combined_subset.var_names
+             for key, fs in feature_sets.items():
+                 if key == 'cpg':
+                     continue
+                 state_target = fs["state"]
+                 feature_map = fs["features"]
+
+                 pred_states = model.predict(tensor_combined_batch)
+                 probs = model.predict_proba(tensor_combined_batch)
+                 classifications = classify_batch(pred_states, probs, coords, feature_map, target_state=state_target)
+
+                 for i, idx in enumerate(combined_subset.obs.index):
+                     for start, length, label, prob in classifications[i]:
+                         adata.obs.at[idx, f"{combined_prefix}_{label}"].append([start, length, prob])
+                         adata.obs.at[idx, f"{combined_prefix}_all_{key}_features"].append([start, length, prob])
+
+     # --- Special handling for CpG ---
+     if cpg:
+         for ref in tqdm(references, desc="Processing CpG"):
+             ref_subset = adata[adata.obs[obs_column] == ref]
+             mask = (ref_subset.var[f"{ref}_CpG_site"] == True)
+             cpg_subset = ref_subset[:, mask]
+             matrix = cpg_subset.layers[layer] if layer else cpg_subset.X
+
+             processed_reads = [[int(x) if not np.isnan(x) else np.random.choice([0, 1]) for x in read] for read in matrix]
+             tensor_batch = torch.tensor(processed_reads, dtype=torch.long, device=device).unsqueeze(-1)
+
+             coords = cpg_subset.var_names
+             fs = feature_sets['cpg']
+             state_target = fs["state"]
+             feature_map = fs["features"]
+
+             pred_states = model.predict(tensor_batch)
+             probs = model.predict_proba(tensor_batch)
+             classifications = classify_batch(pred_states, probs, coords, feature_map, target_state=state_target)
+
+             for i, idx in enumerate(cpg_subset.obs.index):
+                 for start, length, label, prob in classifications[i]:
+                     adata.obs.at[idx, f"CpG_{label}"].append([start, length, prob])
+                     adata.obs.at[idx, f"CpG_all_cpg_features"].append([start, length, prob])
+
+     # --- Binarization + Distance ---
+     for feature in tqdm(all_features, desc="Finalizing Layers"):
+         bin_matrix = np.zeros((adata.shape[0], adata.shape[1]), dtype=int)
+         counts = np.zeros(adata.shape[0], dtype=int)
+         for row_idx, intervals in enumerate(adata.obs[feature]):
+             if not isinstance(intervals, list):
+                 intervals = []
+             for start, length, prob in intervals:
+                 if prob > threshold:
+                     bin_matrix[row_idx, start:start+length] = 1
+                     counts[row_idx] += 1
+         adata.layers[f"{feature}"] = bin_matrix
+         adata.obs[f"n_{feature}"] = counts
+         adata.obs[f"{feature}_distances"] = calculate_batch_distances(adata.obs[feature].tolist(), threshold)
+
+ def calculate_batch_distances(intervals_list, threshold=0.9):
+     """
+     Vectorized calculation of distances across multiple reads.
+
+     Parameters:
+         intervals_list (list of list): Outer list = reads, inner list = intervals [start, length, prob]
+         threshold (float): Minimum probability threshold for filtering
+
+     Returns:
+         List of distance lists, one per read.
+     """
+     results = []
+     for intervals in intervals_list:
+         if not isinstance(intervals, list) or len(intervals) == 0:
+             results.append([])
+             continue
+         valid = [iv for iv in intervals if iv[2] > threshold]
+         valid = sorted(valid, key=lambda x: x[0])
+         dists = [(valid[i + 1][0] - (valid[i][0] + valid[i][1])) for i in range(len(valid) - 1)]
+         results.append(dists)
+     return results
+
+
+ def classify_batch(predicted_states_batch, probabilities_batch, coordinates, classification_mapping, target_state="Methylated"):
+     """
+     Classify batched sequences efficiently.
+
+     Parameters:
+         predicted_states_batch: Tensor [batch_size, seq_len]
+         probabilities_batch: Tensor [batch_size, seq_len, n_states]
+         coordinates: list of genomic coordinates
+         classification_mapping: dict of feature bins
+         target_state: state name ("Methylated" or "Non-Methylated")
+
+     Returns:
+         List of classifications for each sequence.
+     """
+     state_labels = ["Non-Methylated", "Methylated"]
+     target_idx = state_labels.index(target_state)
+     batch_size = predicted_states_batch.shape[0]
+
+     all_classifications = []
+
+     for b in range(batch_size):
+         predicted_states = predicted_states_batch[b].cpu().numpy()
+         probabilities = probabilities_batch[b].cpu().numpy()
+
+         regions = []
+         current_start, current_length, current_probs = None, 0, []
+
+         # Collect runs of the target state as (start index, run length, mean probability)
+         for i, state_index in enumerate(predicted_states):
+             state_prob = probabilities[i][state_index]
+             if state_index == target_idx:
+                 if current_start is None:
+                     current_start = i
+                 current_length += 1
+                 current_probs.append(state_prob)
+             elif current_start is not None:
+                 regions.append((current_start, current_length, np.mean(current_probs)))
+                 current_start, current_length, current_probs = None, 0, []
+
+         if current_start is not None:
+             regions.append((current_start, current_length, np.mean(current_probs)))
+
+         # Convert index runs into genomic intervals and bin them by length
+         final = []
+         for start, length, prob in regions:
+             feature_length = int(coordinates[start + length - 1]) - int(coordinates[start]) + 1
+             label = next((ftype for ftype, rng in classification_mapping.items() if rng[0] <= feature_length < rng[1]), target_state)
+             final.append((int(coordinates[start]) + 1, feature_length, label, prob))
+         all_classifications.append(final)
+
+     return all_classifications
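To make the run-length segmentation and length-binning in classify_batch concrete, here is a small hand-constructed example (the state/probability tensors, coordinates, and bin edges are made up for illustration and are not taken from the package):

    import numpy as np
    import torch
    from smftools.tools.apply_hmm_batched import classify_batch

    pred = torch.tensor([[0, 0, 1, 1, 1, 0]])                 # one read, 6 positions; 1 = Methylated
    probs = torch.tensor([[[0.9, 0.1], [0.8, 0.2], [0.1, 0.9],
                           [0.2, 0.8], [0.1, 0.9], [0.7, 0.3]]])
    coords = ["100", "110", "120", "130", "140", "150"]       # var_names-style genomic positions
    feature_map = {"small_accessible_patch": [0, 25], "large_accessible_patch": [25, np.inf]}

    calls = classify_batch(pred, probs, coords, feature_map, target_state="Methylated")
    # The Methylated run spans coords 120-140, so feature_length = 140 - 120 + 1 = 21,
    # which falls in the [0, 25) bin: [(121, 21, 'small_accessible_patch', ~0.87)]
    print(calls)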