smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/add_read_length_and_mapping_qc.py
@@ -0,0 +1,129 @@
+ import numpy as np
+ import pandas as pd
+ import scipy.sparse as sp
+ from typing import Optional, List, Dict, Union
+
+ def add_read_length_and_mapping_qc(
+     adata,
+     bam_files: Optional[List[str]] = None,
+     read_metrics: Optional[Dict[str, Union[list, tuple]]] = None,
+     uns_flag: str = "read_lenth_and_mapping_qc_performed",
+     extract_read_features_from_bam_callable=None,
+     bypass: bool = False,
+     force_redo: bool = True
+ ):
+     """
+     Populate adata.obs with read/mapping QC columns.
+
+     Parameters
+     ----------
+     adata
+         AnnData to annotate (modified in-place).
+     bam_files
+         Optional list of BAM files to extract metrics from. Ignored if read_metrics is supplied.
+     read_metrics
+         Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality].
+         If provided, it is used directly and bam_files is ignored.
+     uns_flag
+         Key in adata.uns used to record that QC was performed (name keeps the original misspelling).
+     extract_read_features_from_bam_callable
+         Optional callable(bam_path) -> dict mapping read_name -> list/tuple of metrics.
+         If not provided and bam_files is given, the function attempts to call `extract_read_features_from_bam`
+         from the global namespace (the existing helper).
+     Returns
+     -------
+     None (mutates adata in-place)
+     """
+
+     # Only run if not already performed
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo) or bypass:
+         # QC already performed; nothing to do
+         return
+
+     # Build read_metrics dict either from the provided arg or by extracting from BAM files
+     if read_metrics is None:
+         read_metrics = {}
+         if bam_files:
+             extractor = extract_read_features_from_bam_callable or globals().get("extract_read_features_from_bam")
+             if extractor is None:
+                 raise ValueError("No `read_metrics` provided and `extract_read_features_from_bam` not found.")
+             for bam in bam_files:
+                 bam_read_metrics = extractor(bam)
+                 if not isinstance(bam_read_metrics, dict):
+                     raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
+                 read_metrics.update(bam_read_metrics)
+         else:
+             # nothing to do
+             read_metrics = {}
+
+     # Convert read_metrics dict -> DataFrame (rows = read id)
+     # Values may be lists/tuples or scalars; prefer lists/tuples with 5 entries.
+     if len(read_metrics) == 0:
+         # fill with NaNs
+         n = adata.n_obs
+         adata.obs['read_length'] = np.full(n, np.nan)
+         adata.obs['mapped_length'] = np.full(n, np.nan)
+         adata.obs['reference_length'] = np.full(n, np.nan)
+         adata.obs['read_quality'] = np.full(n, np.nan)
+         adata.obs['mapping_quality'] = np.full(n, np.nan)
+     else:
+         # Build the DataFrame robustly:
+         # convert values to lists where possible, else replicate the scalar across columns
+         max_cols = 5
+         rows = {}
+         for k, v in read_metrics.items():
+             if isinstance(v, (list, tuple, np.ndarray)):
+                 vals = list(v)
+             else:
+                 # scalar -> replicate into 5 columns to preserve original behavior
+                 vals = [v] * max_cols
+             # Ensure length >= 5
+             if len(vals) < max_cols:
+                 vals = vals + [np.nan] * (max_cols - len(vals))
+             rows[k] = vals[:max_cols]
+
+         df = pd.DataFrame.from_dict(rows, orient='index', columns=[
+             'read_length', 'read_quality', 'reference_length', 'mapped_length', 'mapping_quality'
+         ])
+
+         # Reindex to adata.obs_names so order matches adata.
+         # Obs_names that are not present as keys in df become NaN.
+         df_reindexed = df.reindex(adata.obs_names).astype(float)
+
+         adata.obs['read_length'] = df_reindexed['read_length'].values
+         adata.obs['mapped_length'] = df_reindexed['mapped_length'].values
+         adata.obs['reference_length'] = df_reindexed['reference_length'].values
+         adata.obs['read_quality'] = df_reindexed['read_quality'].values
+         adata.obs['mapping_quality'] = df_reindexed['mapping_quality'].values
+
+     # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
+     rl = pd.to_numeric(adata.obs['read_length'], errors='coerce').to_numpy(dtype=float)
+     ref_len = pd.to_numeric(adata.obs['reference_length'], errors='coerce').to_numpy(dtype=float)
+     mapped_len = pd.to_numeric(adata.obs['mapped_length'], errors='coerce').to_numpy(dtype=float)
+
+     # safe divisions: use np.where to avoid warnings and replace inf with nan
+     with np.errstate(divide='ignore', invalid='ignore'):
+         rl_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), rl / ref_len, np.nan)
+         mapped_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), mapped_len / ref_len, np.nan)
+         mapped_to_read = np.where((rl != 0) & np.isfinite(rl), mapped_len / rl, np.nan)
+
+     adata.obs['read_length_to_reference_length_ratio'] = rl_to_ref
+     adata.obs['mapped_length_to_reference_length_ratio'] = mapped_to_ref
+     adata.obs['mapped_length_to_read_length_ratio'] = mapped_to_read
+
+     # Add read-level raw modification signal: sum over rows of X
+     X = adata.X
+     if sp.issparse(X):
+         # sum returns an (n_obs, 1) matrix; convert to a 1-D array
+         raw_sig = np.asarray(X.sum(axis=1)).ravel()
+     else:
+         raw_sig = np.asarray(X.sum(axis=1)).ravel()
+
+     adata.obs['Raw_modification_signal'] = raw_sig
+
+     # mark as done
+     adata.uns[uns_flag] = True
+
+     return None
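A minimal usage sketch for the new QC helper, with the function above in scope; the AnnData, read names, and metric values below are invented purely for illustration:

    import anndata as ad
    import numpy as np

    adata = ad.AnnData(np.random.rand(2, 4).astype(np.float32))
    adata.obs_names = ["read_1", "read_2"]

    # read_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality]
    read_metrics = {
        "read_1": [1500, 30.2, 1600, 1480, 60],
        "read_2": [900, 21.7, 1600, 850, 42],
    }

    add_read_length_and_mapping_qc(adata, read_metrics=read_metrics)
    print(adata.obs[["read_length", "mapped_length_to_read_length_ratio"]])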
smftools/preprocessing/append_base_context.py
@@ -0,0 +1,122 @@
+ def append_base_context(adata,
+                         obs_column='Reference_strand',
+                         use_consensus=False,
+                         native=False,
+                         mod_target_bases=['GpC', 'CpG'],
+                         bypass=False,
+                         force_redo=False,
+                         uns_flag='base_context_added'
+                         ):
+     """
+     Adds nucleobase context to each position within the given category. When use_consensus is True, the consensus sequence is used; otherwise the reference FASTA sequence is used.
+
+     Parameters:
+         adata (AnnData): The input adata object.
+         obs_column (str): The observation column to stratify on. Default is 'Reference_strand', which should not be changed for most purposes.
+         use_consensus (bool): Whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
+         native (bool): If False, apply conversion SMF assumptions. If True, apply native SMF assumptions.
+         mod_target_bases (list): Base contexts that may be modified.
+
+     Returns:
+         None
+     """
+     import numpy as np
+     import anndata as ad
+
+     # Only run if not already performed
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo) or bypass:
+         # Base context already added; nothing to do
+         return
+
+     print('Adding base context based on reference FASTA sequence for sample')
+     categories = adata.obs[obs_column].cat.categories
+     site_types = []
+
+     if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
+         site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
+
+     if 'A' in mod_target_bases:
+         site_types += ['A_site']
+
+     for cat in categories:
+         # Assess whether the strand is the top- or bottom-strand converted
+         if 'top' in cat:
+             strand = 'top'
+         elif 'bottom' in cat:
+             strand = 'bottom'
+
+         if native:
+             basename = cat.split(f"_{strand}")[0]
+             if use_consensus:
+                 sequence = adata.uns[f'{basename}_consensus_sequence']
+             else:
+                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
+                 sequence = adata.uns[f'{basename}_FASTA_sequence']
+         else:
+             basename = cat.split(f"_{strand}")[0]
+             if use_consensus:
+                 sequence = adata.uns[f'{basename}_consensus_sequence']
+             else:
+                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
+                 sequence = adata.uns[f'{basename}_FASTA_sequence']
+         # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
+         boolean_dict = {}
+         for site_type in site_types:
+             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+
+         if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
+             if strand == 'top':
+                 # Iterate through the sequence and apply the criteria
+                 for i in range(1, len(sequence) - 1):
+                     if sequence[i] == 'C':
+                         boolean_dict[f'{cat}_C_site'][i] = True
+                         if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
+                             boolean_dict[f'{cat}_GpC_site'][i] = True
+                         elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
+                             boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                         elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
+                             boolean_dict[f'{cat}_CpG_site'][i] = True
+                         elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
+                             boolean_dict[f'{cat}_other_C_site'][i] = True
+             elif strand == 'bottom':
+                 # Iterate through the sequence and apply the criteria
+                 for i in range(1, len(sequence) - 1):
+                     if sequence[i] == 'G':
+                         boolean_dict[f'{cat}_C_site'][i] = True
+                         if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
+                             boolean_dict[f'{cat}_GpC_site'][i] = True
+                         elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
+                             boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                         elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
+                             boolean_dict[f'{cat}_CpG_site'][i] = True
+                         elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
+                             boolean_dict[f'{cat}_other_C_site'][i] = True
+             else:
+                 print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
+
+         if 'A' in mod_target_bases:
+             if strand == 'top':
+                 # Iterate through the sequence and apply the criteria
+                 for i in range(1, len(sequence) - 1):
+                     if sequence[i] == 'A':
+                         boolean_dict[f'{cat}_A_site'][i] = True
+             elif strand == 'bottom':
+                 # Iterate through the sequence and apply the criteria
+                 for i in range(1, len(sequence) - 1):
+                     if sequence[i] == 'T':
+                         boolean_dict[f'{cat}_A_site'][i] = True
+             else:
+                 print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
+
+         for site_type in site_types:
+             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+             if native:
+                 adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].layers['binarized_methylation']
+             else:
+                 adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
+
+     # mark as done
+     adata.uns[uns_flag] = True
+
+     return None
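To make the top-strand classification above concrete, here is a small self-contained sketch on an invented 10 bp sequence; it mirrors the loop logic (the first and last positions are never classified):

    # Classify each top-strand C in a toy sequence, following the criteria above.
    sequence = "AGCGACCGCA"
    for i in range(1, len(sequence) - 1):
        if sequence[i] == 'C':
            prev_base, next_base = sequence[i - 1], sequence[i + 1]
            if prev_base == 'G' and next_base != 'G':
                label = 'GpC_site'
            elif prev_base == 'G' and next_base == 'G':
                label = 'ambiguous_GpC_CpG_site'
            elif next_base == 'G':
                label = 'CpG_site'
            else:
                label = 'other_C_site'
            print(i, label)
    # Prints: 2 ambiguous_GpC_CpG_site, 5 other_C_site, 6 CpG_site, 8 GpC_site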
smftools/preprocessing/append_binary_layer_by_base_context.py
@@ -0,0 +1,143 @@
+ import numpy as np
+ import scipy.sparse as sp
+
+ def append_binary_layer_by_base_context(
+     adata,
+     reference_column: str,
+     smf_modality: str = "conversion",
+     verbose: bool = True,
+     uns_flag: str = "binary_layers_by_base_context_added",
+     bypass: bool = False,
+     force_redo: bool = False
+ ):
+     """
+     Build per-reference C/G-site masked layers:
+       - GpC_site_binary
+       - CpG_site_binary
+       - GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
+       - C_site_binary
+       - other_C_site_binary
+
+     Behavior:
+       - If X is sparse it is converted to dense for these layers (the original adata.X is left untouched).
+       - Missing var columns are warned about but do not crash.
+       - Masked positions are filled with np.nan to make masked vs unmasked explicit.
+       - Requires append_base_context to be run first.
+     """
+
+     # Only run if not already performed
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo) or bypass or ("base_context_added" not in adata.uns):
+         # Layers already added, or base context not yet available; nothing to do
+         return adata
+
+     # check inputs
+     if reference_column not in adata.obs.columns:
+         raise KeyError(f"reference_column '{reference_column}' not found in adata.obs")
+
+     # modality flag (kept for potential downstream use)
+     if smf_modality != "direct":
+         if smf_modality == "conversion":
+             deaminase = False
+         else:
+             deaminase = True
+     else:
+         deaminase = None  # unused but preserved
+
+     # expected per-reference var column names
+     references = adata.obs[reference_column].astype("category").cat.categories
+     reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
+     reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
+     reference_to_c_column = {ref: f"{ref}_C_site" for ref in references}
+     reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
+
+     # verify var columns exist and build boolean masks per ref (len = n_vars)
+     n_obs, n_vars = adata.shape
+     def _col_mask_or_warn(colname):
+         if colname not in adata.var.columns:
+             if verbose:
+                 print(f"Warning: var column '{colname}' not found; treating as all-False mask.")
+             return np.zeros(n_vars, dtype=bool)
+         vals = adata.var[colname].values
+         # coerce truthiness
+         try:
+             return vals.astype(bool)
+         except Exception:
+             return np.array([bool(v) for v in vals], dtype=bool)
+
+     gpc_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_gpc_column.items()}
+     cpg_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_cpg_column.items()}
+     c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_c_column.items()}
+     other_c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_other_c_column.items()}
+
+     # prepare X as dense float32 for layer filling (adata.X itself is left untouched)
+     X = adata.X
+     if sp.issparse(X):
+         if verbose:
+             print("Converting sparse X to dense array for layer construction (temporary).")
+         X = X.toarray()
+     X = np.asarray(X, dtype=np.float32)
+
+     # initialize masked arrays filled with NaN
+     masked_gpc = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_cpg = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_any_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_other_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+
+     # fill row-blocks per reference (this avoids creating a full row×var boolean mask)
+     obs_ref_series = adata.obs[reference_column]
+     for ref in references:
+         rows_mask = (obs_ref_series.values == ref)
+         if not rows_mask.any():
+             continue
+         row_idx = np.nonzero(rows_mask)[0]  # integer indices of rows for this ref
+
+         # column masks for this ref
+         gpc_cols = gpc_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         cpg_cols = cpg_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         c_cols = c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         other_c_cols = other_c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+
+         if gpc_cols.any():
+             # assign only the submatrix (rows x selected cols)
+             masked_gpc[np.ix_(row_idx, gpc_cols)] = X[np.ix_(row_idx, gpc_cols)]
+         if cpg_cols.any():
+             masked_cpg[np.ix_(row_idx, cpg_cols)] = X[np.ix_(row_idx, cpg_cols)]
+         if c_cols.any():
+             masked_any_c[np.ix_(row_idx, c_cols)] = X[np.ix_(row_idx, c_cols)]
+         if other_c_cols.any():
+             masked_other_c[np.ix_(row_idx, other_c_cols)] = X[np.ix_(row_idx, other_c_cols)]
+
+     # Build combined layer:
+     #   numeric sum where either site type exists, NaN where neither exists
+     gpc_nan = np.isnan(masked_gpc)
+     cpg_nan = np.isnan(masked_cpg)
+     combined_sum = np.nan_to_num(masked_gpc, nan=0.0) + np.nan_to_num(masked_cpg, nan=0.0)
+     both_nan = gpc_nan & cpg_nan
+     combined_sum[both_nan] = np.nan
+
+     # Alternative: for a boolean OR combined layer, uncomment:
+     # combined_bool = (~gpc_nan & (masked_gpc != 0)) | (~cpg_nan & (masked_cpg != 0))
+     # combined_layer = combined_bool.astype(np.float32)
+
+     adata.layers['GpC_site_binary'] = masked_gpc
+     adata.layers['CpG_site_binary'] = masked_cpg
+     adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
+     adata.layers['C_site_binary'] = masked_any_c
+     adata.layers['other_C_site_binary'] = masked_other_c
+
+     if verbose:
+         def _filled_positions(arr):
+             return int(np.sum(~np.isnan(arr)))
+         print("Layer build summary (non-NaN cell counts):")
+         print(f"  GpC: {_filled_positions(masked_gpc)}")
+         print(f"  CpG: {_filled_positions(masked_cpg)}")
+         print(f"  GpC+CpG combined: {_filled_positions(combined_sum)}")
+         print(f"  C: {_filled_positions(masked_any_c)}")
+         print(f"  other_C: {_filled_positions(masked_other_c)}")
+
+     # mark as done
+     adata.uns[uns_flag] = True
+
+     return adata
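The combined layer above uses sum-with-NaN-preservation semantics; a tiny sketch with invented values shows the rule (NaN only where both inputs are NaN):

    import numpy as np

    gpc = np.array([[1.0, np.nan, np.nan]])
    cpg = np.array([[np.nan, 0.0, np.nan]])

    combined = np.nan_to_num(gpc, nan=0.0) + np.nan_to_num(cpg, nan=0.0)
    combined[np.isnan(gpc) & np.isnan(cpg)] = np.nan
    print(combined)  # [[ 1.  0. nan]]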
smftools/preprocessing/binarize.py
@@ -0,0 +1,17 @@
+ import numpy as np
+
+ def binarize_adata(adata, source="X", target_layer="binary", threshold=0.8):
+     """
+     Binarize a dense matrix and preserve NaN.
+     source: "X" or layer name
+     """
+     X = adata.X if source == "X" else adata.layers[source]
+
+     # Copy to avoid modifying original in-place
+     X_bin = X.copy()
+
+     # Where not NaN: apply threshold
+     mask = ~np.isnan(X_bin)
+     X_bin[mask] = (X_bin[mask] > threshold).astype(np.int8)
+
+     adata.layers[target_layer] = X_bin
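A short sketch of the thresholding behavior, assuming a dense float matrix (this helper does not handle sparse X); the values are illustrative:

    import numpy as np
    import anndata as ad

    adata = ad.AnnData(np.array([[0.9, 0.1], [np.nan, 0.85]], dtype=np.float32))
    binarize_adata(adata, source="X", target_layer="binary", threshold=0.8)
    print(adata.layers["binary"])  # approximately [[1. 0.] [nan 1.]]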
smftools/preprocessing/binarize_on_Youden.py
@@ -1,4 +1,4 @@
- def binarize_on_Youden(adata, obs_column='Reference'):
+ def binarize_on_Youden(adata, obs_column='Reference', output_layer_name='binarized_methylation'):
      """
      Binarize SMF values based on position thresholds determined by calculate_position_Youden.

@@ -42,4 +42,4 @@ def binarize_on_Youden(adata, obs_column='Reference'):
          binarized_methylation[cat_mask, :] = binarized_matrix

      # Store the binarized matrix in a new layer
-     adata.layers['binarized_methylation'] = binarized_methylation
+     adata.layers[output_layer_name] = binarized_methylation
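The new output_layer_name argument lets callers keep several binarizations side by side; a one-line sketch, assuming calculate_position_Youden has already stored the per-position thresholds:

    binarize_on_Youden(adata, obs_column='Reference', output_layer_name='binarized_methylation_youden')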
smftools/preprocessing/calculate_complexity_II.py
@@ -0,0 +1,248 @@
+ from typing import Optional
+ def calculate_complexity_II(
+     adata,
+     output_directory='',
+     sample_col='Sample_names',
+     ref_col: Optional[str] = 'Reference_strand',
+     cluster_col='sequence__merged_cluster_id',
+     plot=True,
+     save_plot=False,
+     n_boot=30,
+     n_depths=12,
+     random_state=0,
+     csv_summary=True,
+     uns_flag='complexity_analysis_complete',
+     force_redo=False,
+     bypass=False
+ ):
+     """
+     Estimate and plot library complexity.
+
+     If ref_col is None, behaves as before: one calculation per sample.
+     If ref_col is provided (the default is 'Reference_strand'), computes complexity for each (sample, ref) pair.
+
+     Results:
+       - adata.uns['Library_complexity_results'] : dict keyed by (sample,) or (sample, ref) -> dict with fields
+         C0, n_reads, n_unique, depths, mean_unique, ci_low, ci_high
+       - Also stores a per-group record in adata.uns[f'Library_complexity_{sanitized_name}'] (backwards compatible)
+       - Optionally saves PNGs and CSVs (curve points + fit summary)
+     """
+     import os
+     import numpy as np
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     from scipy.optimize import curve_fit
+     from datetime import datetime
+
+     # early exits
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo):
+         return None
+     if bypass:
+         return None
+
+     rng = np.random.default_rng(random_state)
+
+     def lw(x, C0):
+         return C0 * (1.0 - np.exp(-x / C0))
+
+     def sanitize(name: str) -> str:
+         return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
+
+     # checks
+     for col in (sample_col, cluster_col):
+         if col not in adata.obs.columns:
+             raise KeyError(f"Required column '{col}' not found in adata.obs")
+     if ref_col is not None and ref_col not in adata.obs.columns:
+         raise KeyError(f"ref_col '{ref_col}' not found in adata.obs")
+
+     if save_plot or csv_summary:
+         os.makedirs(output_directory or ".", exist_ok=True)
+
+     # containers to collect CSV rows across all groups
+     fit_records = []
+     curve_records = []
+
+     # output dict stored centrally
+     results = {}
+
+     # build list of groups: either samples only, or (sample, ref) pairs
+     sseries = adata.obs[sample_col].astype("category")
+     samples = list(sseries.cat.categories)
+     if ref_col is None:
+         group_keys = [(s,) for s in samples]
+     else:
+         rseries = adata.obs[ref_col].astype("category")
+         references = list(rseries.cat.categories)
+         group_keys = []
+         # iterate only pairs that exist in the data to avoid empty processing
+         for s in samples:
+             mask_s = (adata.obs[sample_col] == s)
+             # find references present for this sample
+             ref_present = pd.Categorical(adata.obs.loc[mask_s, ref_col]).categories
+             # Use the intersection of known reference categories and those present for the sample
+             for r in ref_present:
+                 group_keys.append((s, r))
+
+     # iterate groups
+     for g in group_keys:
+         if ref_col is None:
+             sample = g[0]
+             # filter mask
+             mask = (adata.obs[sample_col] == sample).values
+             group_label = f"{sample}"
+         else:
+             sample, ref = g
+             mask = (adata.obs[sample_col] == sample) & (adata.obs[ref_col] == ref)
+             group_label = f"{sample}__{ref}"
+
+         n_reads = int(mask.sum())
+         if n_reads < 2:
+             # store empty placeholders and continue
+             results[g] = {
+                 "C0": np.nan,
+                 "n_reads": int(n_reads),
+                 "n_unique": 0,
+                 "depths": np.array([], dtype=int),
+                 "mean_unique": np.array([], dtype=float),
+                 "ci_low": np.array([], dtype=float),
+                 "ci_high": np.array([], dtype=float),
+             }
+             # also store back-compat key
+             adata.uns[f'Library_complexity_{sanitize(group_label)}'] = results[g]
+             continue
+
+         # cluster ids array for this group
+         clusters = adata.obs.loc[mask, cluster_col].to_numpy()
+         # observed unique molecules at full depth
+         observed_unique = int(pd.unique(clusters).size)
+
+         # choose subsampling depths
+         if n_depths < 2:
+             depths = np.array([n_reads], dtype=int)
+         else:
+             lo = max(10, int(0.05 * n_reads))
+             depths = np.unique(np.linspace(lo, n_reads, n_depths, dtype=int))
+         depths = depths[depths > 0]
+         depths = depths.astype(int)
+         if depths.size == 0:
+             depths = np.array([n_reads], dtype=int)
+
+         # bootstrap sampling: for each depth, sample without replacement (if possible)
+         idx_all = np.arange(n_reads)
+         boot_unique = np.zeros((len(depths), n_boot), dtype=float)
+         for di, d in enumerate(depths):
+             d_use = int(min(d, n_reads))
+             # if d_use == n_reads we can short-circuit and set boot results to the full observed uniques
+             if d_use == n_reads:
+                 # bootstraps are deterministic in this special case
+                 uniq_val = float(observed_unique)
+                 boot_unique[di, :] = uniq_val
+                 continue
+             # otherwise run bootstraps
+             for b in range(n_boot):
+                 take = rng.choice(idx_all, size=d_use, replace=False)
+                 boot_unique[di, b] = np.unique(clusters[take]).size
+
+         mean_unique = boot_unique.mean(axis=1)
+         lo_ci = np.percentile(boot_unique, 2.5, axis=1)
+         hi_ci = np.percentile(boot_unique, 97.5, axis=1)
+
+         # fit Lander-Waterman to the mean curve (safe bounds)
+         C0_init = max(observed_unique, mean_unique[-1] if mean_unique.size else observed_unique)
+         try:
+             popt, _ = curve_fit(
+                 lw,
+                 xdata=depths.astype(float),
+                 ydata=mean_unique.astype(float),
+                 p0=[C0_init],
+                 bounds=(1.0, 1e12),
+                 maxfev=10000,
+             )
+             C0 = float(popt[0])
+         except Exception:
+             C0 = float(observed_unique)
+
+         # store results
+         results[g] = {
+             "C0": C0,
+             "n_reads": int(n_reads),
+             "n_unique": int(observed_unique),
+             "depths": depths,
+             "mean_unique": mean_unique,
+             "ci_low": lo_ci,
+             "ci_high": hi_ci,
+         }
+
+         # save per-group in adata.uns for backward compatibility
+         adata.uns[f'Library_complexity_{sanitize(group_label)}'] = results[g]
+
+         # prepare curve and fit records for CSV
+         fit_records.append({
+             "sample": sample,
+             "reference": ref if ref_col is not None else "",
+             "C0": float(C0),
+             "n_reads": int(n_reads),
+             "n_unique_observed": int(observed_unique),
+         })
+
+         x_fit = np.linspace(0, max(n_reads, int(depths[-1]) if depths.size else n_reads), 200)
+         y_fit = lw(x_fit, C0)
+         for d, mu, lo, hi in zip(depths, mean_unique, lo_ci, hi_ci):
+             curve_records.append({
+                 "sample": sample,
+                 "reference": ref if ref_col is not None else "",
+                 "type": "bootstrap",
+                 "depth": int(d),
+                 "mean_unique": float(mu),
+                 "ci_low": float(lo),
+                 "ci_high": float(hi),
+             })
+         for xf, yf in zip(x_fit, y_fit):
+             curve_records.append({
+                 "sample": sample,
+                 "reference": ref if ref_col is not None else "",
+                 "type": "fit",
+                 "depth": float(xf),
+                 "mean_unique": float(yf),
+                 "ci_low": np.nan,
+                 "ci_high": np.nan,
+             })
+
+         # plotting for this group
+         if plot:
+             plt.figure(figsize=(6.5, 4.5))
+             plt.fill_between(depths, lo_ci, hi_ci, alpha=0.25, label="Bootstrap 95% CI")
+             plt.plot(depths, mean_unique, "o", label="Bootstrap mean")
+             plt.plot([n_reads], [observed_unique], "s", label="Observed (full)")
+             plt.plot(x_fit, y_fit, "-", label=f"LW fit C0≈{C0:,.0f}")
+             plt.xlabel("Total reads (subsampled depth)")
+             plt.ylabel("Unique molecules (clusters)")
+             title = f"Library Complexity — {sample}" + (f" / {ref}" if ref_col is not None else "")
+             plt.title(title)
+             plt.grid(True, alpha=0.3)
+             plt.legend()
+             plt.tight_layout()
+
+             if save_plot:
+                 fname = f"complexity_{sanitize(group_label)}.png"
+                 plt.savefig(os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight")
+                 plt.close()
+             else:
+                 plt.show()
+
+     # store central results dict
+     adata.uns["Library_complexity_results"] = results
+
+     # mark complexity analysis as complete
+     adata.uns[uns_flag] = True
+
+     # CSV outputs
+     if csv_summary and (fit_records or curve_records):
+         fit_df = pd.DataFrame(fit_records)
+         curve_df = pd.DataFrame(curve_records)
+         base = output_directory or "."
+         fit_df.to_csv(os.path.join(base, "complexity_fit_summary.csv"), index=False)
+         curve_df.to_csv(os.path.join(base, "complexity_curves.csv"), index=False)
+
+     return results
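The curve fit above is the one-parameter Lander-Waterman saturation model, unique(x) = C0 * (1 - exp(-x / C0)). A quick numeric check with an assumed C0 illustrates its behavior:

    import numpy as np

    def lw(x, C0):
        return C0 * (1.0 - np.exp(-x / C0))

    C0 = 50_000.0  # assumed number of unique molecules in the library
    for depth in (10_000, 50_000, 200_000):
        print(depth, round(lw(depth, C0)))
    # Nearly linear while depth is well below C0, then saturates toward C0 as depth grows.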