smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0

smftools/plotting/qc_plotting.py
@@ -1,12 +1,8 @@
  import os
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
 
- import os
+ import matplotlib.pyplot as plt
  import numpy as np
  import pandas as pd
- import matplotlib.pyplot as plt
 
 
  def plot_read_qc_histograms(
@@ -83,7 +79,11 @@ def plot_read_qc_histograms(
      for key in valid_keys:
          if not is_numeric[key]:
              continue
-         s = pd.to_numeric(adata.obs[key], errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
+         s = (
+             pd.to_numeric(adata.obs[key], errors="coerce")
+             .replace([np.inf, -np.inf], np.nan)
+             .dropna()
+         )
          if s.size < min_non_nan:
              # still set something to avoid errors; just use min/max or (0,1)
              lo, hi = (0.0, 1.0) if s.size == 0 else (float(s.min()), float(s.max()))
@@ -99,6 +99,7 @@ def plot_read_qc_histograms(
          global_ranges[key] = (lo, hi)
 
      def _sanitize(name: str) -> str:
+         """Sanitize a string for use in filenames."""
          return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
 
      ncols = len(valid_keys)
@@ -107,17 +108,18 @@ def plot_read_qc_histograms(
      fig_h_unit = figsize_cell[1]
 
      for start in range(0, len(sample_levels), rows_per_fig):
-         chunk = sample_levels[start:start + rows_per_fig]
+         chunk = sample_levels[start : start + rows_per_fig]
          nrows = len(chunk)
          fig, axes = plt.subplots(
-             nrows=nrows, ncols=ncols,
+             nrows=nrows,
+             ncols=ncols,
              figsize=(fig_w, fig_h_unit * nrows),
              dpi=dpi,
              squeeze=False,
          )
 
         for r, sample_val in enumerate(chunk):
-             row_mask = (adata.obs[sample_key].values == sample_val)
+             row_mask = adata.obs[sample_key].values == sample_val
             n_in_row = int(row_mask.sum())
 
             for c, key in enumerate(valid_keys):
@@ -125,7 +127,11 @@ def plot_read_qc_histograms(
                  series = adata.obs.loc[row_mask, key]
 
                  if is_numeric[key]:
-                     x = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan).dropna()
+                     x = (
+                         pd.to_numeric(series, errors="coerce")
+                         .replace([np.inf, -np.inf], np.nan)
+                         .dropna()
+                     )
                      if x.size < min_non_nan:
                          ax.text(0.5, 0.5, f"n={x.size} (<{min_non_nan})", ha="center", va="center")
                      else:
@@ -143,7 +149,9 @@ def plot_read_qc_histograms(
                  else:
                      vc = series.astype("category").value_counts(dropna=False)
                      if vc.sum() < min_non_nan:
-                         ax.text(0.5, 0.5, f"n={vc.sum()} (<{min_non_nan})", ha="center", va="center")
+                         ax.text(
+                             0.5, 0.5, f"n={vc.sum()} (<{min_non_nan})", ha="center", va="center"
+                         )
                      else:
                          vc_top = vc.iloc[:topn_categories][::-1]  # show top-N, reversed for barh
                          ax.barh(vc_top.index.astype(str), vc_top.values)
@@ -267,4 +275,4 @@ def plot_read_qc_histograms(
      # fname = f"{key}_{sample_key}_{safe_group}.png" if sample_key else f"{key}.png"
      # fname = fname.replace("/", "_")
      # fig.savefig(os.path.join(outdir, fname))
-     # plt.close(fig)
+     # plt.close(fig)
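
Aside: the pd.to_numeric chain that this qc_plotting.py diff merely reflows is the plotter's core coercion trick, so a standalone sketch may help (toy series; all names here are invented for illustration):

import numpy as np
import pandas as pd

# A mixed obs column of the kind the QC plotter must tolerate.
raw = pd.Series(["3.5", "oops", np.inf, -np.inf, None, 7])

# Coerce to numeric, map infinities to NaN, then drop NaNs -- the same
# chain used for global axis ranges and per-sample histograms above.
clean = (
    pd.to_numeric(raw, errors="coerce")
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

print(clean.tolist())  # [3.5, 7.0]
print(clean.size)      # 2; compared against min_non_nan before plotting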

smftools/preprocessing/__init__.py
@@ -1,40 +1,38 @@
- from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
  from .append_base_context import append_base_context
  from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
- from .binarize_on_Youden import binarize_on_Youden
  from .binarize import binarize_adata
- from .calculate_complexity import calculate_complexity
+ from .binarize_on_Youden import binarize_on_Youden
  from .calculate_complexity_II import calculate_complexity_II
- from .calculate_read_modification_stats import calculate_read_modification_stats
  from .calculate_coverage import calculate_coverage
  from .calculate_position_Youden import calculate_position_Youden
  from .calculate_read_length_stats import calculate_read_length_stats
+ from .calculate_read_modification_stats import calculate_read_modification_stats
  from .clean_NaN import clean_NaN
  from .filter_adata_by_nan_proportion import filter_adata_by_nan_proportion
- from .filter_reads_on_modification_thresholds import filter_reads_on_modification_thresholds
  from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quality_mapping
+ from .filter_reads_on_modification_thresholds import filter_reads_on_modification_thresholds
+ from .flag_duplicate_reads import flag_duplicate_reads
  from .invert_adata import invert_adata
  from .load_sample_sheet import load_sample_sheet
- from .flag_duplicate_reads import flag_duplicate_reads
+ from .reindex_references_adata import reindex_references_adata
  from .subsample_adata import subsample_adata
 
  __all__ = [
-     "add_read_length_and_mapping_qc",
      "append_base_context",
      "append_binary_layer_by_base_context",
      "binarize_on_Youden",
      "binarize_adata",
-     "calculate_complexity",
+     "calculate_complexity_II",
      "calculate_read_modification_stats",
-     "calculate_coverage",
+     "calculate_coverage",
      "calculate_position_Youden",
      "calculate_read_length_stats",
-     "clean_NaN",
+     "clean_NaN",
      "filter_adata_by_nan_proportion",
      "filter_reads_on_modification_thresholds",
      "filter_reads_on_length_quality_mapping",
      "invert_adata",
      "load_sample_sheet",
      "flag_duplicate_reads",
-     "subsample_adata"
- ]
+     "subsample_adata",
+ ]

smftools/preprocessing/append_base_context.py
@@ -1,27 +1,38 @@
- def append_base_context(adata,
-                         obs_column='Reference_strand',
-                         use_consensus=False,
-                         native=False,
-                         mod_target_bases=['GpC', 'CpG'],
-                         bypass=False,
-                         force_redo=False,
-                         uns_flag='base_context_added'
-                         ):
-     """
-     Adds nucleobase context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
-
-     Parameters:
-         adata (AnnData): The input adata object.
-         obs_column (str): The observation column in which to stratify on. Default is 'Reference_strand', which should not be changed for most purposes.
-         use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
-         native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions
-         mod_target_bases (list): Base contexts that may be modified.
-
-     Returns:
-         None
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from smftools.logging_utils import get_logger
+
+ if TYPE_CHECKING:
+     import anndata as ad
+
+ logger = get_logger(__name__)
+
+
+ def append_base_context(
+     adata: "ad.AnnData",
+     ref_column: str = "Reference_strand",
+     use_consensus: bool = False,
+     native: bool = False,
+     mod_target_bases: list[str] = ["GpC", "CpG"],
+     bypass: bool = False,
+     force_redo: bool = False,
+     uns_flag: str = "append_base_context_performed",
+ ) -> None:
+     """Append base context annotations to ``adata``.
+
+     Args:
+         adata: AnnData object.
+         ref_column: Obs column used to stratify references.
+         use_consensus: Whether to use consensus sequences rather than FASTA references.
+         native: If ``True``, use native SMF assumptions; otherwise use conversion assumptions.
+         mod_target_bases: Base contexts that may be modified.
+         bypass: Whether to skip processing.
+         force_redo: Whether to rerun even if ``uns_flag`` is set.
+         uns_flag: Flag in ``adata.uns`` indicating prior completion.
      """
      import numpy as np
-     import anndata as ad
 
      # Only run if not already performed
      already = bool(adata.uns.get(uns_flag, False))
@@ -29,94 +40,118 @@ def append_base_context(
          # QC already performed; nothing to do
          return
 
-     print('Adding base context based on reference FASTA sequence for sample')
-     categories = adata.obs[obs_column].cat.categories
+     logger.info("Adding base context based on reference FASTA sequence for sample")
+     references = adata.obs[ref_column].cat.categories
      site_types = []
-
-     if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
-         site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
-
-     if 'A' in mod_target_bases:
-         site_types += ['A_site']
-
-     for cat in categories:
+
+     if any(base in mod_target_bases for base in ["GpC", "CpG", "C"]):
+         site_types += ["GpC_site", "CpG_site", "ambiguous_GpC_CpG_site", "other_C_site", "C_site"]
+
+     if "A" in mod_target_bases:
+         site_types += ["A_site"]
+
+     for ref in references:
          # Assess if the strand is the top or bottom strand converted
-         if 'top' in cat:
-             strand = 'top'
-         elif 'bottom' in cat:
-             strand = 'bottom'
+         if "top" in ref:
+             strand = "top"
+         elif "bottom" in ref:
+             strand = "bottom"
 
          if native:
-             basename = cat.split(f"_{strand}")[0]
+             basename = ref.split(f"_{strand}")[0]
             if use_consensus:
-                 sequence = adata.uns[f'{basename}_consensus_sequence']
+                 sequence = adata.uns[f"{basename}_consensus_sequence"]
             else:
                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                 sequence = adata.uns[f'{basename}_FASTA_sequence']
+                 sequence = adata.uns[f"{basename}_FASTA_sequence"]
         else:
-             basename = cat.split(f"_{strand}")[0]
+             basename = ref.split(f"_{strand}")[0]
             if use_consensus:
-                 sequence = adata.uns[f'{basename}_consensus_sequence']
+                 sequence = adata.uns[f"{basename}_consensus_sequence"]
             else:
                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-                 sequence = adata.uns[f'{basename}_FASTA_sequence']
-         # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
+                 sequence = adata.uns[f"{basename}_FASTA_sequence"]
+
+         # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
          boolean_dict = {}
         for site_type in site_types:
-             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+             boolean_dict[f"{ref}_{site_type}"] = np.full(len(sequence), False, dtype=bool)
 
-         if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
-             if strand == 'top':
+         if any(base in mod_target_bases for base in ["GpC", "CpG", "C"]):
+             if strand == "top":
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
-                     if sequence[i] == 'C':
-                         boolean_dict[f'{cat}_C_site'][i] = True
-                         if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                             boolean_dict[f'{cat}_GpC_site'][i] = True
-                         elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                             boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                         elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                             boolean_dict[f'{cat}_CpG_site'][i] = True
-                         elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                             boolean_dict[f'{cat}_other_C_site'][i] = True
-             elif strand == 'bottom':
+                     if sequence[i] == "C":
+                         boolean_dict[f"{ref}_C_site"][i] = True
+                         if sequence[i - 1] == "G" and sequence[i + 1] != "G":
+                             boolean_dict[f"{ref}_GpC_site"][i] = True
+                         elif sequence[i - 1] == "G" and sequence[i + 1] == "G":
+                             boolean_dict[f"{ref}_ambiguous_GpC_CpG_site"][i] = True
+                         elif sequence[i - 1] != "G" and sequence[i + 1] == "G":
+                             boolean_dict[f"{ref}_CpG_site"][i] = True
+                         elif sequence[i - 1] != "G" and sequence[i + 1] != "G":
+                             boolean_dict[f"{ref}_other_C_site"][i] = True
+             elif strand == "bottom":
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
-                     if sequence[i] == 'G':
-                         boolean_dict[f'{cat}_C_site'][i] = True
-                         if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
-                             boolean_dict[f'{cat}_GpC_site'][i] = True
-                         elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
-                             boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
-                         elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
-                             boolean_dict[f'{cat}_CpG_site'][i] = True
-                         elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
-                             boolean_dict[f'{cat}_other_C_site'][i] = True
+                     if sequence[i] == "G":
+                         boolean_dict[f"{ref}_C_site"][i] = True
+                         if sequence[i + 1] == "C" and sequence[i - 1] != "C":
+                             boolean_dict[f"{ref}_GpC_site"][i] = True
+                         elif sequence[i - 1] == "C" and sequence[i + 1] == "C":
+                             boolean_dict[f"{ref}_ambiguous_GpC_CpG_site"][i] = True
+                         elif sequence[i - 1] == "C" and sequence[i + 1] != "C":
+                             boolean_dict[f"{ref}_CpG_site"][i] = True
+                         elif sequence[i - 1] != "C" and sequence[i + 1] != "C":
+                             boolean_dict[f"{ref}_other_C_site"][i] = True
             else:
-                 print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
+                 logger.error(
+                     "Top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name."
+                 )
 
-         if 'A' in mod_target_bases:
-             if strand == 'top':
+         if "A" in mod_target_bases:
+             if strand == "top":
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
-                     if sequence[i] == 'A':
-                         boolean_dict[f'{cat}_A_site'][i] = True
-             elif strand == 'bottom':
+                     if sequence[i] == "A":
+                         boolean_dict[f"{ref}_A_site"][i] = True
+             elif strand == "bottom":
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
-                     if sequence[i] == 'T':
-                         boolean_dict[f'{cat}_A_site'][i] = True
+                     if sequence[i] == "T":
+                         boolean_dict[f"{ref}_A_site"][i] = True
             else:
-                 print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
+                 logger.error(
+                     "Top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name."
+                 )
 
         for site_type in site_types:
-             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+             # Site context annotations for each reference
+             adata.var[f"{ref}_{site_type}"] = boolean_dict[f"{ref}_{site_type}"].astype(bool)
+             # Restrict the site type labels to only be in positions that occur at a high enough frequency in the dataset
+             if adata.uns.get("calculate_coverage_performed", False):
+                 adata.var[f"{ref}_{site_type}_valid_coverage"] = (
+                     (adata.var[f"{ref}_{site_type}"]) & (adata.var[f"position_in_{ref}"])
+                 )
+                 if native:
+                     adata.obsm[f"{ref}_{site_type}_valid_coverage"] = adata[
+                         :, adata.var[f"{ref}_{site_type}_valid_coverage"]
+                     ].layers["binarized_methylation"]
+                 else:
+                     adata.obsm[f"{ref}_{site_type}_valid_coverage"] = adata[
+                         :, adata.var[f"{ref}_{site_type}_valid_coverage"]
+                     ].X
+             else:
+                 pass
+
             if native:
-                 adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].layers['binarized_methylation']
+                 adata.obsm[f"{ref}_{site_type}"] = adata[:, adata.var[f"{ref}_{site_type}"]].layers[
+                     "binarized_methylation"
+                 ]
             else:
-                 adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
+                 adata.obsm[f"{ref}_{site_type}"] = adata[:, adata.var[f"{ref}_{site_type}"]].X
 
      # mark as done
      adata.uns[uns_flag] = True
 
-     return None
+     return None
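
The cat → ref renames in this append_base_context.py diff leave the classification rule itself unchanged, and it is easiest to see on a toy top-strand sequence. A self-contained sketch of the same logic (function name invented here):

import numpy as np

def classify_top_strand_c_sites(sequence):
    """Mirror of the per-reference top-strand C classification above."""
    kinds = ["C", "GpC", "ambiguous_GpC_CpG", "CpG", "other_C"]
    masks = {k: np.zeros(len(sequence), dtype=bool) for k in kinds}
    for i in range(1, len(sequence) - 1):
        if sequence[i] == "C":
            masks["C"][i] = True
            if sequence[i - 1] == "G" and sequence[i + 1] != "G":
                masks["GpC"][i] = True  # GpC-MTase footprinting site
            elif sequence[i - 1] == "G" and sequence[i + 1] == "G":
                masks["ambiguous_GpC_CpG"][i] = True  # GCG: cannot be assigned
            elif sequence[i - 1] != "G" and sequence[i + 1] == "G":
                masks["CpG"][i] = True  # endogenous CpG site
            else:
                masks["other_C"][i] = True  # neither context
    return masks

masks = classify_top_strand_c_sites("AGCGGCATCGA")
print(np.flatnonzero(masks["GpC"]))                # [5]
print(np.flatnonzero(masks["CpG"]))                # [8]
print(np.flatnonzero(masks["ambiguous_GpC_CpG"]))  # [2]

The bottom-strand branch applies the mirror rule to G positions, reading the complementary context.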

smftools/preprocessing/append_binary_layer_by_base_context.py
@@ -1,33 +1,51 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
  import numpy as np
  import scipy.sparse as sp
 
+ from smftools.logging_utils import get_logger
+
+ if TYPE_CHECKING:
+     import anndata as ad
+
+ logger = get_logger(__name__)
+
+
  def append_binary_layer_by_base_context(
-     adata,
+     adata: "ad.AnnData",
      reference_column: str,
      smf_modality: str = "conversion",
      verbose: bool = True,
-     uns_flag: str = "binary_layers_by_base_context_added",
+     uns_flag: str = "append_binary_layer_by_base_context_performed",
      bypass: bool = False,
-     force_redo: bool = False
- ):
-     """
-     Build per-reference C/G-site masked layers:
-       - GpC_site_binary
-       - CpG_site_binary
-       - GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
-       - C_site_binary
-       - other_C_site_binary
-
-     Behavior:
-       - If X is sparse it will be converted to dense for these layers (keeps original adata.X untouched).
-       - Missing var columns are warned about but do not crash.
-       - Masked positions are filled with np.nan to make masked vs unmasked explicit.
-       - Requires append_base_context to be run first
+     force_redo: bool = False,
+     from_valid_sites_only: bool = False,
+     valid_site_col_suffix: str = "_valid_coverage",
+ ) -> "ad.AnnData":
+     """Build per-reference masked layers for base-context sites.
+
+     Args:
+         adata: AnnData object to annotate.
+         reference_column: Obs column containing reference identifiers.
+         smf_modality: SMF modality identifier.
+         verbose: Whether to log layer summary information.
+         uns_flag: Flag in ``adata.uns`` indicating prior completion.
+         bypass: Whether to skip processing.
+         force_redo: Whether to rerun even if ``uns_flag`` is set.
+         from_valid_sites_only: Whether to use valid-coverage site masks only.
+         valid_site_col_suffix: Suffix for valid-coverage site columns.
+
+     Returns:
+         anndata.AnnData: AnnData object with new masked layers.
      """
+     if not from_valid_sites_only:
+         valid_site_col_suffix = ""
 
      # Only run if not already performed
      already = bool(adata.uns.get(uns_flag, False))
-     if (already and not force_redo) or bypass or ("base_context_added" not in adata.uns):
+     if (already and not force_redo) or bypass or ("append_base_context_performed" not in adata.uns):
          # QC already performed; nothing to do
          return adata
 
@@ -46,17 +64,25 @@ def append_binary_layer_by_base_context(
 
      # expected per-reference var column names
      references = adata.obs[reference_column].astype("category").cat.categories
-     reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
-     reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
-     reference_to_c_column = {ref: f"{ref}_C_site" for ref in references}
-     reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
+     reference_to_gpc_column = {ref: f"{ref}_GpC_site{valid_site_col_suffix}" for ref in references}
+     reference_to_cpg_column = {ref: f"{ref}_CpG_site{valid_site_col_suffix}" for ref in references}
+     reference_to_c_column = {ref: f"{ref}_C_site{valid_site_col_suffix}" for ref in references}
+     reference_to_other_c_column = {
+         ref: f"{ref}_other_C_site{valid_site_col_suffix}" for ref in references
+     }
+     reference_to_a_column = {ref: f"{ref}_A_site{valid_site_col_suffix}" for ref in references}
 
      # verify var columns exist and build boolean masks per ref (len = n_vars)
      n_obs, n_vars = adata.shape
+
      def _col_mask_or_warn(colname):
+         """Return a boolean mask for a var column, or all-False if missing."""
          if colname not in adata.var.columns:
             if verbose:
-                 print(f"Warning: var column '{colname}' not found; treating as all-False mask.")
+                 logger.warning(
+                     "Var column '%s' not found; treating as all-False mask.",
+                     colname,
+                 )
             return np.zeros(n_vars, dtype=bool)
         vals = adata.var[colname].values
         # coerce truthiness
@@ -67,14 +93,17 @@ def append_binary_layer_by_base_context(
 
      gpc_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_gpc_column.items()}
      cpg_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_cpg_column.items()}
-     c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_c_column.items()}
-     other_c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_other_c_column.items()}
+     c_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_c_column.items()}
+     other_c_var_masks = {
+         ref: _col_mask_or_warn(col) for ref, col in reference_to_other_c_column.items()
+     }
+     a_var_masks = {ref: _col_mask_or_warn(col) for ref, col in reference_to_a_column.items()}
 
      # prepare X as dense float32 for layer filling (we leave adata.X untouched)
      X = adata.X
      if sp.issparse(X):
          if verbose:
-             print("Converting sparse X to dense array for layer construction (temporary).")
+             logger.info("Converting sparse X to dense array for layer construction (temporary).")
          X = X.toarray()
      X = np.asarray(X, dtype=np.float32)
 
@@ -83,11 +112,12 @@ def append_binary_layer_by_base_context(
      masked_cpg = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
      masked_any_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
      masked_other_c = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
+     masked_a = np.full((n_obs, n_vars), np.nan, dtype=np.float32)
 
      # fill row-blocks per reference (this avoids creating a full row×var boolean mask)
      obs_ref_series = adata.obs[reference_column]
      for ref in references:
-         rows_mask = (obs_ref_series.values == ref)
+         rows_mask = obs_ref_series.values == ref
          if not rows_mask.any():
             continue
         row_idx = np.nonzero(rows_mask)[0]  # integer indices of rows for this ref
@@ -95,8 +125,9 @@ def append_binary_layer_by_base_context(
          # column masks for this ref
          gpc_cols = gpc_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
          cpg_cols = cpg_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
-         c_cols = c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         c_cols = c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
          other_c_cols = other_c_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
+         a_cols = a_var_masks.get(ref, np.zeros(n_vars, dtype=bool))
 
          if gpc_cols.any():
             # assign only the submatrix (rows x selected cols)
@@ -107,6 +138,8 @@ def append_binary_layer_by_base_context(
              masked_any_c[np.ix_(row_idx, c_cols)] = X[np.ix_(row_idx, c_cols)]
          if other_c_cols.any():
              masked_other_c[np.ix_(row_idx, other_c_cols)] = X[np.ix_(row_idx, other_c_cols)]
+         if a_cols.any():
+             masked_a[np.ix_(row_idx, other_c_cols)] = X[np.ix_(row_idx, other_c_cols)]
 
      # Build combined layer:
      #   - numeric_sum: sum where either exists, NaN where neither exists
@@ -121,21 +154,26 @@ def append_binary_layer_by_base_context(
      # combined_bool = (~gpc_nan & (masked_gpc != 0)) | (~cpg_nan & (masked_cpg != 0))
      # combined_layer = combined_bool.astype(np.float32)
 
-     adata.layers['GpC_site_binary'] = masked_gpc
-     adata.layers['CpG_site_binary'] = masked_cpg
-     adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
-     adata.layers['C_site_binary'] = masked_any_c
-     adata.layers['other_C_site_binary'] = masked_other_c
+     adata.layers["GpC_site_binary"] = masked_gpc
+     adata.layers["CpG_site_binary"] = masked_cpg
+     adata.layers["GpC_CpG_combined_site_binary"] = combined_sum
+     adata.layers["C_site_binary"] = masked_any_c
+     adata.layers["other_C_site_binary"] = masked_other_c
+     adata.layers["A_site_binary"] = masked_a
 
      if verbose:
+
          def _filled_positions(arr):
+             """Count the number of non-NaN positions in an array."""
              return int(np.sum(~np.isnan(arr)))
-         print("Layer build summary (non-NaN cell counts):")
-         print(f"  GpC: {_filled_positions(masked_gpc)}")
-         print(f"  CpG: {_filled_positions(masked_cpg)}")
-         print(f"  GpC+CpG combined: {_filled_positions(combined_sum)}")
-         print(f"  C: {_filled_positions(masked_any_c)}")
-         print(f"  other_C: {_filled_positions(masked_other_c)}")
+
+         logger.info("Layer build summary (non-NaN cell counts):")
+         logger.info("  GpC: %s", _filled_positions(masked_gpc))
+         logger.info("  CpG: %s", _filled_positions(masked_cpg))
+         logger.info("  GpC+CpG combined: %s", _filled_positions(combined_sum))
+         logger.info("  C: %s", _filled_positions(masked_any_c))
+         logger.info("  other_C: %s", _filled_positions(masked_other_c))
+         logger.info("  A: %s", _filled_positions(masked_a))
 
      # mark as done
      adata.uns[uns_flag] = True
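
Two details in this append_binary_layer_by_base_context.py diff deserve a closer look. Layer filling relies on np.ix_ so that only each reference's reads-by-sites submatrix is copied, and the new A-site block indexes its fill with other_c_cols rather than a_cols, which reads like a copy-paste slip. A toy sketch of the presumably intended assignment (all names are local to this example):

import numpy as np

# Toy matrix: 4 reads x 6 positions.
X = np.arange(24, dtype=np.float32).reshape(4, 6)

row_idx = np.array([0, 2])  # reads assigned to one reference
a_cols = np.array([False, True, False, True, False, False])  # A-site mask

masked_a = np.full(X.shape, np.nan, dtype=np.float32)
if a_cols.any():
    # np.ix_ builds an open mesh from the row indices and the boolean
    # column mask, so only the selected submatrix is read and written.
    masked_a[np.ix_(row_idx, a_cols)] = X[np.ix_(row_idx, a_cols)]

print(masked_a[0])  # [nan  1. nan  3. nan nan]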

smftools/preprocessing/archived/calculate_complexity.py
@@ -21,9 +21,11 @@ def calculate_complexity(adata, output_directory='', obs_column='Reference', sam
      from scipy.optimize import curve_fit
 
      def lander_waterman(x, C0):
+         """Lander-Waterman curve for complexity estimation."""
          return C0 * (1 - np.exp(-x / C0))
 
      def count_unique_reads(reads, depth):
+         """Count unique reads in a subsample of the given depth."""
          subsample = np.random.choice(reads, depth, replace=False)
          return len(np.unique(subsample))
 
@@ -69,4 +71,4 @@ def calculate_complexity(adata, output_directory='', obs_column='Reference', sam
          plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
          plt.close()
      else:
-         plt.show()
+         plt.show()
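
For context, the helpers gaining docstrings here implement the Lander-Waterman saturation model, C(x) = C0 * (1 - exp(-x / C0)): the expected number of unique reads after sampling x reads from a library of complexity C0. A sketch of how such a fit works with scipy (synthetic data; not the package's actual call site):

import numpy as np
from scipy.optimize import curve_fit

def lander_waterman(x, C0):
    """Expected unique reads after drawing x reads from a library of complexity C0."""
    return C0 * (1 - np.exp(-x / C0))

# Synthetic saturation curve with true C0 = 1000; in the package these
# points come from subsampling reads at increasing depths.
depths = np.array([100.0, 500.0, 1000.0, 2000.0, 4000.0])
rng = np.random.default_rng(0)
unique = lander_waterman(depths, 1000.0) + rng.normal(0.0, 5.0, depths.size)

(c0_fit,), _ = curve_fit(lander_waterman, depths, unique, p0=[unique.max()])
print(round(c0_fit))  # close to 1000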

smftools/preprocessing/archived/preprocessing.py
@@ -322,12 +322,14 @@ def min_non_diagonal(matrix):
          min_values.append(np.min(row))
      return min_values
 
- def lander_waterman(x, C0):
-     return C0 * (1 - np.exp(-x / C0))
+ def lander_waterman(x, C0):
+     """Lander-Waterman curve for complexity estimation."""
+     return C0 * (1 - np.exp(-x / C0))
 
- def count_unique_reads(reads, depth):
-     subsample = np.random.choice(reads, depth, replace=False)
-     return len(np.unique(subsample))
+ def count_unique_reads(reads, depth):
+     """Count unique reads in a subsample of the given depth."""
+     subsample = np.random.choice(reads, depth, replace=False)
+     return len(np.unique(subsample))
 
  def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
      """
@@ -611,4 +613,4 @@ def binarize_on_Youden(adata, obs_column='Reference'):
      # Pull back the new binarized layers into the original adata object
      adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
 
- ######################################################################################################
+ ######################################################################################################

smftools/preprocessing/binarize.py
@@ -1,9 +1,26 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
  import numpy as np
 
- def binarize_adata(adata, source="X", target_layer="binary", threshold=0.8):
-     """
-     Binarize a dense matrix and preserve NaN.
-     source: "X" or layer name
+ if TYPE_CHECKING:
+     import anndata as ad
+
+
+ def binarize_adata(
+     adata: "ad.AnnData",
+     source: str = "X",
+     target_layer: str = "binary",
+     threshold: float = 0.8,
+ ) -> None:
+     """Binarize a dense matrix and preserve NaNs.
+
+     Args:
+         adata: AnnData object with input matrix or layer.
+         source: ``"X"`` to use the main matrix or a layer name.
+         target_layer: Layer name to store the binarized values.
+         threshold: Threshold above which values are set to 1.
      """
      X = adata.X if source == "X" else adata.layers[source]
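
The binarize.py diff is truncated before the thresholding body, but the new docstring pins down the contract: finite values are thresholded to {0, 1} while NaNs pass through into target_layer. One NaN-preserving way to meet that contract (the helper name and the strict > comparison are assumptions, not the package's code):

import numpy as np

def binarize_preserving_nan(X, threshold=0.8):
    """Map finite values to {0, 1} by threshold; leave NaNs untouched."""
    out = np.full(X.shape, np.nan, dtype=np.float32)
    finite = ~np.isnan(X)
    out[finite] = (X[finite] > threshold).astype(np.float32)
    return out

X = np.array([[0.9, 0.1, np.nan], [0.5, 0.95, 0.0]])
print(binarize_preserving_nan(X))
# [[ 1.  0. nan]
#  [ 0.  1.  0.]]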