smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/binarize_on_Youden.py
@@ -1,47 +1,143 @@
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
 
-
-        adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
-        obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+logger = get_logger(__name__)
 
-
-
+
+def binarize_on_Youden(
+    adata: "ad.AnnData",
+    ref_column: str = "Reference_strand",
+    output_layer_name: str = "binarized_methylation",
+    mask_failed_positions: bool = True,
+) -> None:
+    """Binarize SMF values using thresholds from ``calculate_position_Youden``.
+
+    Args:
+        adata: AnnData object to binarize.
+        ref_column: Obs column denoting reference/strand categories.
+        output_layer_name: Layer in which to store the binarized matrix.
+        mask_failed_positions: If ``True``, positions that failed Youden QC are set to NaN;
+            otherwise all positions are binarized.
     """
+
     import numpy as np
-    import anndata as ad
 
-    #
-
+    # Extract dense X once
+    X = adata.X
+    if hasattr(X, "toarray"):  # sparse → dense
+        X = X.toarray()
+
+    n_obs, n_var = X.shape
+    binarized = np.full((n_obs, n_var), np.nan, dtype=float)
 
-    # Get unique categories
     references = adata.obs[ref_column].cat.categories
+    ref_labels = adata.obs[ref_column].to_numpy()
 
     for ref in references:
-
-
-
+        logger.info("Binarizing on Youden statistics for %s", ref)
+
+        ref_mask = ref_labels == ref
+        if not np.any(ref_mask):
+            continue
+
+        X_block = X[ref_mask, :].astype(float, copy=True)
+
+        # thresholds: list of (threshold, J)
+        youden_stats = adata.var[f"{ref}_position_methylation_thresholding_Youden_stats"].to_numpy()
+
+        thresholds = np.array(
+            [t[0] if isinstance(t, (tuple, list)) else np.nan for t in youden_stats],
+            dtype=float,
+        )
+
+        # QC mask
+        qc_mask = adata.var[f"{ref}_position_passed_Youden_thresholding_QC"].to_numpy().astype(bool)
+
+        if mask_failed_positions:
+            # Only binarize positions passing QC
+            cols_to_binarize = np.where(qc_mask)[0]
+        else:
+            # Binarize all positions
+            cols_to_binarize = np.arange(n_var)
+
+        # Prepare result block
+        block_out = np.full_like(X_block, np.nan, dtype=float)
+
+        if len(cols_to_binarize) > 0:
+            sub_X = X_block[:, cols_to_binarize]
+            sub_thresh = thresholds[cols_to_binarize]
+
+            nan_mask = np.isnan(sub_X)
+
+            bin_sub = (sub_X > sub_thresh[None, :]).astype(float)
+            bin_sub[nan_mask] = np.nan
+
+            block_out[:, cols_to_binarize] = bin_sub
+
+        # Write into full output matrix
+        binarized[ref_mask, :] = block_out
+
+    adata.layers[output_layer_name] = binarized
+    logger.info(
+        "Finished binarization → stored in adata.layers['%s'] (mask_failed_positions=%s)",
+        output_layer_name,
+        mask_failed_positions,
+    )
+
+
+# def binarize_on_Youden(adata,
+#                        ref_column='Reference_strand',
+#                        output_layer_name='binarized_methylation'):
+#     """
+#     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+
+#     Parameters:
+#         adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+#         obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+#     Modifies:
+#         Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
+#     """
+#     import numpy as np
+#     import anndata as ad
+
+#     # Initialize an empty matrix to store the binarized methylation values
+#     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float)  # Keeps same shape as adata.X
+
+#     # Get unique categories
+#     references = adata.obs[ref_column].cat.categories
+
+#     for ref in references:
+#         print(f"Binarizing adata on Youden statistics for {ref}")
+#         # Select subset for this category
+#         ref_mask = adata.obs[ref_column] == ref
+#         ref_subset = adata[ref_mask]
+
+#         # Extract the probability matrix
+#         original_matrix = ref_subset.X.copy()
 
-
-
+#         # Extract the thresholds for each position efficiently
+#         thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
 
-
-
+#         # Identify NaN values
+#         nan_mask = np.isnan(original_matrix)
 
-
-
+#         # Binarize based on threshold
+#         binarized_matrix = (original_matrix > thresholds).astype(float)
 
-
-
+#         # Restore NaN values
+#         binarized_matrix[nan_mask] = np.nan
 
-
-
+#         # Assign the binarized values back into the preallocated storage
+#         binarized_methylation[ref_subset, :] = binarized_matrix
 
-
-
+#         # Store the binarized matrix in a new layer
+#         adata.layers[output_layer_name] = binarized_methylation
 
-
-    adata.layers[output_layer_name] = binarized_methylation
+#     print(f"Finished binarizing adata on Youden statistics")
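For orientation, here is a minimal usage sketch of the refactored `binarize_on_Youden` API above. The toy AnnData, the reference name `refA_top`, and the hand-built Youden columns are all illustrative; in a real run `calculate_position_Youden` populates the `*_Youden_stats` and `*_QC` var columns.

```python
import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.binarize_on_Youden import binarize_on_Youden

# Toy object: 4 reads x 3 positions of methylation probabilities.
X = np.array(
    [[0.9, 0.2, np.nan],
     [0.1, 0.8, 0.7],
     [0.6, 0.4, 0.3],
     [0.2, 0.9, 0.5]]
)
adata = ad.AnnData(X=X)
adata.obs["Reference_strand"] = pd.Categorical(["refA_top"] * 4)

# Columns normally produced by calculate_position_Youden:
# per-position (threshold, J-statistic) tuples plus a QC pass/fail flag.
adata.var["refA_top_position_methylation_thresholding_Youden_stats"] = pd.Series(
    [(0.5, 0.8), (0.5, 0.7), (0.5, 0.9)], index=adata.var_names, dtype=object
)
adata.var["refA_top_position_passed_Youden_thresholding_QC"] = [True, True, False]

binarize_on_Youden(adata, ref_column="Reference_strand")

# 0/1 calls per read/position; the failed-QC column stays NaN by default.
print(adata.layers["binarized_methylation"])
```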
smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,28 +1,34 @@
 ## binary_layers_to_ohe
 
-
-
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+## Conversion SMF Specific
+def binary_layers_to_ohe(adata, binary_layers, stack="hstack"):
     """
     Parameters:
         adata (AnnData): Anndata object.
-        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
+        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
         stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
-
+
     Returns:
         ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     Input: An adata object and a list of layers containing a binary encoding.
     """
     import numpy as np
-    import anndata as ad
 
     # Ensure that the N layer is last!
     # Grab all binary layers that are not encoding N
-    ACGT_binary_layers = [
+    ACGT_binary_layers = [
+        layer for layer in binary_layers if "binary" in layer and layer != "N_binary_encoding"
+    ]
     # If there is a binary layer encoding N, hold it in N_binary_layer
-    N_binary_layer = [layer for layer in binary_layers if layer ==
+    N_binary_layer = [layer for layer in binary_layers if layer == "N_binary_encoding"]
     # Add the N_binary_encoding layer to the end of the list of binary layers
     all_binary_layers = ACGT_binary_layers + N_binary_layer
-
+    logger.info("Found %s layers in adata", all_binary_layers)
 
     # Extract the layers
     layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
@@ -33,8 +39,8 @@ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
         for layer in layers:
             read_ohe.append(layer[i])
         read_name = adata.obs_names[i]
-        if stack ==
+        if stack == "hstack":
            ohe_dict[read_name] = np.hstack(read_ohe)
-        elif stack ==
+        elif stack == "vstack":
            ohe_dict[read_name] = np.vstack(read_ohe)
-    return ohe_dict
+    return ohe_dict
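A sketch of how the cleaned-up `binary_layers_to_ohe` is typically called, assuming an AnnData whose per-base binary layers follow the `<base>_binary_encoding` naming the function filters on; the toy matrices and layer names are illustrative.

```python
import anndata as ad
import numpy as np

from smftools.preprocessing.binary_layers_to_ohe import binary_layers_to_ohe

adata = ad.AnnData(X=np.zeros((2, 3)))
# One binary layer per base identity; the function moves N_binary_encoding to the end.
adata.layers["A_binary_encoding"] = np.array([[1, 0, 0], [0, 1, 0]], dtype=float)
adata.layers["C_binary_encoding"] = np.array([[0, 1, 0], [1, 0, 0]], dtype=float)
adata.layers["N_binary_encoding"] = np.array([[0, 0, 1], [0, 0, 1]], dtype=float)

ohe = binary_layers_to_ohe(
    adata,
    ["N_binary_encoding", "A_binary_encoding", "C_binary_encoding"],
    stack="hstack",
)

# Each read name maps to a 1-D vector of length n_layers * n_positions.
for read_name, vec in ohe.items():
    print(read_name, vec)
```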
smftools/preprocessing/calculate_complexity_II.py
@@ -1,42 +1,59 @@
-from
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
 def calculate_complexity_II(
-    adata,
-    output_directory=
-    sample_col=
-    ref_col: Optional[str] =
-    cluster_col=
-    plot=True,
-    save_plot=False,
-    n_boot=30,
-    n_depths=12,
-    random_state=0,
-    csv_summary=True,
-    uns_flag=
-    force_redo=False,
-    bypass=False
-):
-    """
-    Estimate and plot library complexity.
+    adata: "ad.AnnData",
+    output_directory: str | Path = "",
+    sample_col: str = "Sample_names",
+    ref_col: Optional[str] = "Reference_strand",
+    cluster_col: str = "sequence__merged_cluster_id",
+    plot: bool = True,
+    save_plot: bool = False,
+    n_boot: int = 30,
+    n_depths: int = 12,
+    random_state: int = 0,
+    csv_summary: bool = True,
+    uns_flag: str = "calculate_complexity_II_performed",
+    force_redo: bool = False,
+    bypass: bool = False,
+) -> None:
+    """Estimate and optionally plot library complexity.
 
-    If ref_col is None
-
+    If ``ref_col`` is ``None``, the calculation is performed per sample. If provided,
+    complexity is computed for each ``(sample, reference)`` pair.
 
-
-
-
-
-
+    Args:
+        adata: AnnData object containing read metadata.
+        output_directory: Directory for output plots/CSVs.
+        sample_col: Obs column containing sample names.
+        ref_col: Obs column with reference/strand categories, or ``None``.
+        cluster_col: Obs column with merged cluster IDs.
+        plot: Whether to generate plots.
+        save_plot: Whether to save plots to disk.
+        n_boot: Number of bootstrap iterations per depth.
+        n_depths: Number of subsampling depths to evaluate.
+        random_state: Random seed for bootstrapping.
+        csv_summary: Whether to write CSV summary files.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is present.
+        bypass: Whether to skip processing.
     """
     import os
+
+    import matplotlib.pyplot as plt
     import numpy as np
     import pandas as pd
-    import matplotlib.pyplot as plt
     from scipy.optimize import curve_fit
-    from datetime import datetime
 
     # early exits
     already = bool(adata.uns.get(uns_flag, False))
-    if
+    if already and not force_redo:
         return None
     if bypass:
         return None
@@ -44,9 +61,11 @@ def calculate_complexity_II(
     rng = np.random.default_rng(random_state)
 
     def lw(x, C0):
+        """Lander-Waterman curve for complexity estimation."""
         return C0 * (1.0 - np.exp(-x / C0))
 
     def sanitize(name: str) -> str:
+        """Sanitize a string for safe filenames."""
         return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
 
     # checks
@@ -77,7 +96,7 @@ def calculate_complexity_II(
     group_keys = []
     # iterate only pairs that exist in data to avoid empty processing
     for s in samples:
-        mask_s =
+        mask_s = adata.obs[sample_col] == s
         # find references present for this sample
         ref_present = pd.Categorical(adata.obs.loc[mask_s, ref_col]).categories
         # Use intersection of known reference categories and those present for sample
@@ -109,7 +128,7 @@ def calculate_complexity_II(
                 "ci_high": np.array([], dtype=float),
             }
             # also store back-compat key
-            adata.uns[f
+            adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
             continue
 
         # cluster ids array for this group
@@ -175,39 +194,45 @@ def calculate_complexity_II(
         }
 
         # save per-group in adata.uns for backward compatibility
-        adata.uns[f
+        adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
 
         # prepare curve and fit records for CSV
-        fit_records.append(
-
-
-
-
-
+        fit_records.append(
+            {
+                "sample": sample,
+                "reference": ref if ref_col is not None else "",
+                "C0": float(C0),
+                "n_reads": int(n_reads),
+                "n_unique_observed": int(observed_unique),
+            }
+        )
 
         x_fit = np.linspace(0, max(n_reads, int(depths[-1]) if depths.size else n_reads), 200)
         y_fit = lw(x_fit, C0)
         for d, mu, lo, hi in zip(depths, mean_unique, lo_ci, hi_ci):
-            curve_records.append(
-
-
-
-
-
-
-
+            curve_records.append(
+                {
+                    "sample": sample,
+                    "reference": ref if ref_col is not None else "",
+                    "type": "bootstrap",
+                    "depth": int(d),
+                    "mean_unique": float(mu),
+                    "ci_low": float(lo),
+                    "ci_high": float(hi),
+                }
+            )
         for xf, yf in zip(x_fit, y_fit):
-            curve_records.append(
-
-
-
-
-
-
-
+            curve_records.append(
+                {
+                    "sample": sample,
+                    "reference": ref if ref_col is not None else "",
+                    "type": "fit",
+                    "depth": float(xf),
+                    "mean_unique": float(yf),
+                    "ci_low": np.nan,
+                    "ci_high": np.nan,
+                }
+            )
 
         # plotting for this group
         if plot:
@@ -226,7 +251,9 @@ def calculate_complexity_II(
 
             if save_plot:
                 fname = f"complexity_{sanitize(group_label)}.png"
-                plt.savefig(
+                plt.savefig(
+                    os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight"
+                )
                 plt.close()
             else:
                 plt.show()
@@ -242,7 +269,7 @@ def calculate_complexity_II(
         fit_df = pd.DataFrame(fit_records)
         curve_df = pd.DataFrame(curve_records)
         base = output_directory or "."
-        fit_df.to_csv(os.path.join(base,
-        curve_df.to_csv(os.path.join(base,
+        fit_df.to_csv(os.path.join(base, "complexity_fit_summary.csv"), index=False)
+        curve_df.to_csv(os.path.join(base, "complexity_curves.csv"), index=False)
 
     return results
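The core of `calculate_complexity_II` is the Lander-Waterman saturation model visible in the hunk above: the number of unique molecules observed at read depth x follows C0 * (1 - exp(-x / C0)), and the fitted C0 estimates total library complexity. Below is a self-contained sketch of that fit on simulated reads; the simulated library size, depths, and bootstrap counts are illustrative, not the package's defaults.

```python
import numpy as np
from scipy.optimize import curve_fit

def lw(x, C0):
    # Lander-Waterman curve: expected unique molecules at sequencing depth x.
    return C0 * (1.0 - np.exp(-x / C0))

rng = np.random.default_rng(0)
# 100k reads drawn from a library of ~5k unique molecules.
library = rng.integers(0, 5_000, size=100_000)

# Bootstrap the mean number of unique molecules at each subsampling depth.
depths = np.linspace(1_000, 100_000, 12, dtype=int)
mean_unique = [
    np.mean([
        len(np.unique(rng.choice(library, size=d, replace=False)))
        for _ in range(10)
    ])
    for d in depths
]

# Fit C0 to the bootstrapped saturation curve.
(C0,), _ = curve_fit(lw, depths, mean_unique, p0=[max(mean_unique)])
print(f"Estimated library complexity C0 ≈ {C0:,.0f} unique molecules")
```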
smftools/preprocessing/calculate_consensus.py
@@ -1,19 +1,28 @@
 # calculate_consensus
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
+def calculate_consensus(
+    adata: "ad.AnnData",
+    reference: str,
+    sample: str | bool = False,
+    reference_column: str = "Reference",
+    sample_column: str = "Sample",
+) -> None:
+    """Calculate a consensus sequence for a reference (and optional sample).
+
+    Args:
+        adata: AnnData object to append consensus metadata to.
+        reference: Reference name to subset on.
+        sample: If ``False``, uses all samples. If a string is passed, subsets to that sample.
+        reference_column: Obs column with reference names.
+        sample_column: Obs column with sample names.
     """
     import numpy as np
 
@@ -25,11 +34,11 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
         pass
 
     # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
-    layers = [layer for layer in record_subset.layers if
+    layers = [layer for layer in record_subset.layers if "_binary_" in layer]
     layer_map, layer_counts = {}, []
     for i, layer in enumerate(layers):
         # Gives an integer mapping to access which sequence base the binary layer is encoding
-        layer_map[i] = layer.split(
+        layer_map[i] = layer.split("_")[0]
         # Get the positional counts from all reads for the given base identity.
         layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
     # Combine the positional counts array derived from each binary base layer into an ndarray
@@ -40,8 +49,8 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
     consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
 
     if sample:
-        adata.var[f
+        adata.var[f"{reference}_consensus_from_{sample}"] = consensus_sequence_list
     else:
-        adata.var[f
+        adata.var[f"{reference}_consensus_across_samples"] = consensus_sequence_list
 
-    adata.uns[f
+    adata.uns[f"{reference}_consensus_sequence"] = consensus_sequence_list
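A hypothetical call pattern for the updated `calculate_consensus` signature, assuming `adata` was produced upstream by the load/preprocess pipeline and already carries the per-base `*_binary_*` layers the function scans; the reference and sample names are placeholders.

```python
from smftools.preprocessing.calculate_consensus import calculate_consensus

# Consensus across all samples for one reference; writes
# adata.var["refA_consensus_across_samples"] and adata.uns["refA_consensus_sequence"].
calculate_consensus(adata, reference="refA", reference_column="Reference")

# Consensus restricted to one sample; writes adata.var["refA_consensus_from_sample1"].
calculate_consensus(
    adata,
    reference="refA",
    sample="sample1",
    reference_column="Reference",
    sample_column="Sample",
)
```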
smftools/preprocessing/calculate_coverage.py
@@ -1,54 +1,76 @@
-
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
 
-
-    adata (AnnData): An AnnData object
-    obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
-    position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
+logger = get_logger(__name__)
 
-
-
+
+def calculate_coverage(
+    adata: "ad.AnnData",
+    ref_column: str = "Reference_strand",
+    position_nan_threshold: float = 0.01,
+    smf_modality: str = "deaminase",
+    target_layer: str = "binarized_methylation",
+    uns_flag: str = "calculate_coverage_performed",
+    force_redo: bool = False,
+) -> None:
+    """Append position-level coverage metadata per reference category.
+
+    Args:
+        adata: AnnData object.
+        ref_column: Obs column used to define reference/strand categories.
+        position_nan_threshold: Minimum fraction of coverage to mark a position as valid.
+        smf_modality: SMF modality. Use ``adata.X`` for conversion/deaminase or ``target_layer`` for direct.
+        target_layer: Layer used for direct SMF coverage calculations.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
    """
    import numpy as np
    import pandas as pd
-    import anndata as ad
 
    # Only run if not already performed
    already = bool(adata.uns.get(uns_flag, False))
-    if already:
+    if already and not force_redo:
        # QC already performed; nothing to do
        return
-
+
    references = adata.obs[ref_column].cat.categories
    n_categories_with_position = np.zeros(adata.shape[1])
 
    # Loop over references
    for ref in references:
-
+        logger.info("Assessing positional coverage across samples for %s reference", ref)
 
        # Subset to current category
        ref_mask = adata.obs[ref_column] == ref
        temp_ref_adata = adata[ref_mask]
 
+        if smf_modality == "direct":
+            matrix = temp_ref_adata.layers[target_layer]
+        else:
+            matrix = temp_ref_adata.X
+
        # Compute fraction of valid coverage
-        ref_valid_coverage = np.sum(~np.isnan(
+        ref_valid_coverage = np.sum(~np.isnan(matrix), axis=0)
        ref_valid_fraction = ref_valid_coverage / temp_ref_adata.shape[0]  # Avoid extra computation
 
        # Store coverage stats
-        adata.var[f
+        adata.var[f"{ref}_valid_count"] = pd.Series(ref_valid_coverage, index=adata.var.index)
+        adata.var[f"{ref}_valid_fraction"] = pd.Series(ref_valid_fraction, index=adata.var.index)
 
        # Assign whether the position is covered based on threshold
-        adata.var[f
+        adata.var[f"position_in_{ref}"] = ref_valid_fraction >= position_nan_threshold
 
        # Sum the number of categories covering each position
-        n_categories_with_position += adata.var[f
+        n_categories_with_position += adata.var[f"position_in_{ref}"].values
 
    # Store final category count
-    adata.var[f
+    adata.var[f"N_{ref_column}_with_position"] = n_categories_with_position.astype(int)
 
    # mark as done
-    adata.uns[uns_flag] = True
+    adata.uns[uns_flag] = True
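A minimal sketch of `calculate_coverage` on a toy AnnData, following the column names in the hunk above; the data and reference name are illustrative.

```python
import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.calculate_coverage import calculate_coverage

# 3 reads x 3 positions; NaN marks positions without valid calls.
X = np.array(
    [[0.9, np.nan, 0.1],
     [0.8, np.nan, np.nan],
     [np.nan, np.nan, 0.3]]
)
adata = ad.AnnData(X=X)
adata.obs["Reference_strand"] = pd.Categorical(["refA_top"] * 3)

calculate_coverage(adata, position_nan_threshold=0.5, smf_modality="deaminase")

# Per-position coverage fractions and the >= 0.5 validity calls.
print(adata.var[["refA_top_valid_fraction", "position_in_refA_top"]])
# The uns flag now guards reruns; pass force_redo=True to recompute.
print(adata.uns["calculate_coverage_performed"])
```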
smftools/preprocessing/calculate_pairwise_differences.py
@@ -1,5 +1,6 @@
 # calculate_pairwise_differences
 
+
 def calculate_pairwise_differences(arrays):
     """
     Calculate the pairwise differences for a list of h-stacked ndarrays. Ignore N-positions
@@ -41,7 +42,7 @@ def calculate_pairwise_differences(arrays):
             # Calculate the hamming distance directly with boolean operations
             differences = (array_i != array_j) & ~combined_mask
             distance = np.sum(differences) / np.sum(~combined_mask)
-
+
             # Store the symmetric distances
             distance_matrix[i, j] = distance
             distance_matrix[j, i] = distance
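A worked example of the masked-difference formula in the hunk above: positions where either read carries an N (the combined mask) are excluded from both the numerator and the denominator. The toy vectors are illustrative.

```python
import numpy as np

array_i = np.array([1, 0, 1, 1])
array_j = np.array([1, 1, 0, 1])
combined_mask = np.array([False, False, True, False])  # e.g. an N position in one read

# Count mismatches only at unmasked positions, then normalize by valid positions.
differences = (array_i != array_j) & ~combined_mask
distance = np.sum(differences) / np.sum(~combined_mask)
print(distance)  # 1 mismatch over 3 valid positions -> 0.3333...
```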
smftools/preprocessing/calculate_pairwise_hamming_distances.py
@@ -1,6 +1,6 @@
 ## calculate_pairwise_hamming_distances
 
-## Conversion SMF Specific
+## Conversion SMF Specific
 def calculate_pairwise_hamming_distances(arrays):
     """
     Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
@@ -13,8 +13,9 @@ def calculate_pairwise_hamming_distances(arrays):
 
     """
     import numpy as np
-    from tqdm import tqdm
     from scipy.spatial.distance import hamming
+    from tqdm import tqdm
+
     num_arrays = len(arrays)
     # Initialize an empty distance matrix
     distance_matrix = np.zeros((num_arrays, num_arrays))
@@ -24,4 +25,4 @@ def calculate_pairwise_hamming_distances(arrays):
             distance = hamming(arrays[i], arrays[j])
             distance_matrix[i, j] = distance
             distance_matrix[j, i] = distance
-    return distance_matrix
+    return distance_matrix