smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/{archives → archived}/preprocessing.py

```diff
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 ## preprocessing
 from .. import readwrite
 
@@ -322,12 +324,14 @@ def min_non_diagonal(matrix):
         min_values.append(np.min(row))
     return min_values
 
-def lander_waterman(x, C0):
-
+def lander_waterman(x, C0):
+    """Lander-Waterman curve for complexity estimation."""
+    return C0 * (1 - np.exp(-x / C0))
 
-def count_unique_reads(reads, depth):
-
-
+def count_unique_reads(reads, depth):
+    """Count unique reads in a subsample of the given depth."""
+    subsample = np.random.choice(reads, depth, replace=False)
+    return len(np.unique(subsample))
 
 def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
     """
@@ -611,4 +615,4 @@ def binarize_on_Youden(adata, obs_column='Reference'):
     # Pull back the new binarized layers into the original adata object
     adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
 
-######################################################################################################
+######################################################################################################
```
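The two helpers documented above implement the standard Lander-Waterman saturation model: sampling `x` reads from a library of complexity `C0` is expected to yield about `C0 * (1 - exp(-x / C0))` unique molecules. A minimal standalone sketch of how they combine (synthetic read IDs, not the smftools API):

```python
import numpy as np

def lander_waterman(x, C0):
    """Lander-Waterman curve for complexity estimation."""
    return C0 * (1 - np.exp(-x / C0))

def count_unique_reads(reads, depth):
    """Count unique reads in a subsample of the given depth."""
    subsample = np.random.choice(reads, depth, replace=False)
    return len(np.unique(subsample))

# Simulate a library of 1,000 unique molecules sequenced to 5,000 total reads.
np.random.seed(0)
reads = np.random.randint(0, 1000, size=5000)

for depth in (500, 1000, 2500, 5000):
    observed = count_unique_reads(reads, depth)
    predicted = lander_waterman(depth, C0=1000)
    print(f"depth={depth}: {observed} unique observed, ~{predicted:.0f} predicted")
```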
smftools/preprocessing/binarize.py

```diff
@@ -1,9 +1,26 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import numpy as np
 
-
-
-
-
+if TYPE_CHECKING:
+    import anndata as ad
+
+
+def binarize_adata(
+    adata: "ad.AnnData",
+    source: str = "X",
+    target_layer: str = "binary",
+    threshold: float = 0.8,
+) -> None:
+    """Binarize a dense matrix and preserve NaNs.
+
+    Args:
+        adata: AnnData object with input matrix or layer.
+        source: ``"X"`` to use the main matrix or a layer name.
+        target_layer: Layer name to store the binarized values.
+        threshold: Threshold above which values are set to 1.
     """
     X = adata.X if source == "X" else adata.layers[source]
 
```
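Based on the signature above, usage is a single in-place call; a sketch with a synthetic AnnData (the import path follows the file location in the change list and assumes the function is exported there):

```python
import anndata as ad
import numpy as np

from smftools.preprocessing.binarize import binarize_adata  # path per the file list above

# Three reads × four positions of methylation probabilities, with missing calls.
X = np.array([
    [0.90, 0.10, np.nan, 0.85],
    [0.20, 0.95, 0.70, 0.10],
    [0.81, 0.40, 0.99, np.nan],
])
adata = ad.AnnData(X=X)

# Per the docstring: values above `threshold` become 1, others 0, and NaNs are preserved.
binarize_adata(adata, source="X", target_layer="binary", threshold=0.8)
print(adata.layers["binary"])
```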
smftools/preprocessing/binarize_on_Youden.py

```diff
@@ -1,47 +1,143 @@
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
 
-
-        adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
-        obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+logger = get_logger(__name__)
 
-
-
+
+def binarize_on_Youden(
+    adata: "ad.AnnData",
+    ref_column: str = "Reference_strand",
+    output_layer_name: str = "binarized_methylation",
+    mask_failed_positions: bool = True,
+) -> None:
+    """Binarize SMF values using thresholds from ``calculate_position_Youden``.
+
+    Args:
+        adata: AnnData object to binarize.
+        ref_column: Obs column denoting reference/strand categories.
+        output_layer_name: Layer in which to store the binarized matrix.
+        mask_failed_positions: If ``True``, positions that failed Youden QC are set to NaN;
+            otherwise all positions are binarized.
     """
+
     import numpy as np
-    import anndata as ad
 
-    #
-
+    # Extract dense X once
+    X = adata.X
+    if hasattr(X, "toarray"):  # sparse → dense
+        X = X.toarray()
+
+    n_obs, n_var = X.shape
+    binarized = np.full((n_obs, n_var), np.nan, dtype=float)
 
-    # Get unique categories
     references = adata.obs[ref_column].cat.categories
+    ref_labels = adata.obs[ref_column].to_numpy()
 
     for ref in references:
-
-
-
+        logger.info("Binarizing on Youden statistics for %s", ref)
+
+        ref_mask = ref_labels == ref
+        if not np.any(ref_mask):
+            continue
+
+        X_block = X[ref_mask, :].astype(float, copy=True)
+
+        # thresholds: list of (threshold, J)
+        youden_stats = adata.var[f"{ref}_position_methylation_thresholding_Youden_stats"].to_numpy()
+
+        thresholds = np.array(
+            [t[0] if isinstance(t, (tuple, list)) else np.nan for t in youden_stats],
+            dtype=float,
+        )
+
+        # QC mask
+        qc_mask = adata.var[f"{ref}_position_passed_Youden_thresholding_QC"].to_numpy().astype(bool)
+
+        if mask_failed_positions:
+            # Only binarize positions passing QC
+            cols_to_binarize = np.where(qc_mask)[0]
+        else:
+            # Binarize all positions
+            cols_to_binarize = np.arange(n_var)
+
+        # Prepare result block
+        block_out = np.full_like(X_block, np.nan, dtype=float)
+
+        if len(cols_to_binarize) > 0:
+            sub_X = X_block[:, cols_to_binarize]
+            sub_thresh = thresholds[cols_to_binarize]
+
+            nan_mask = np.isnan(sub_X)
+
+            bin_sub = (sub_X > sub_thresh[None, :]).astype(float)
+            bin_sub[nan_mask] = np.nan
+
+            block_out[:, cols_to_binarize] = bin_sub
+
+        # Write into full output matrix
+        binarized[ref_mask, :] = block_out
+
+    adata.layers[output_layer_name] = binarized
+    logger.info(
+        "Finished binarization → stored in adata.layers['%s'] (mask_failed_positions=%s)",
+        output_layer_name,
+        mask_failed_positions,
+    )
+
+
+# def binarize_on_Youden(adata,
+#                        ref_column='Reference_strand',
+#                        output_layer_name='binarized_methylation'):
+#     """
+#     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+
+#     Parameters:
+#         adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+#         obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+#     Modifies:
+#         Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
+#     """
+#     import numpy as np
+#     import anndata as ad
+
+#     # Initialize an empty matrix to store the binarized methylation values
+#     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float)  # Keeps same shape as adata.X
+
+#     # Get unique categories
+#     references = adata.obs[ref_column].cat.categories
+
+#     for ref in references:
+#         print(f"Binarizing adata on Youden statistics for {ref}")
+#         # Select subset for this category
+#         ref_mask = adata.obs[ref_column] == ref
+#         ref_subset = adata[ref_mask]
+
+#         # Extract the probability matrix
+#         original_matrix = ref_subset.X.copy()
 
-
-
+#         # Extract the thresholds for each position efficiently
+#         thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
 
-
-
+#         # Identify NaN values
+#         nan_mask = np.isnan(original_matrix)
 
-
-
+#         # Binarize based on threshold
+#         binarized_matrix = (original_matrix > thresholds).astype(float)
 
-
-
+#         # Restore NaN values
+#         binarized_matrix[nan_mask] = np.nan
 
-
-
+#         # Assign the binarized values back into the preallocated storage
+#         binarized_methylation[ref_subset, :] = binarized_matrix
 
-
-    adata.layers[output_layer_name] = binarized_methylation
+#         # Store the binarized matrix in a new layer
+#         adata.layers[output_layer_name] = binarized_methylation
 
+#     print(f"Finished binarizing adata on Youden statistics")
```
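The core of the rewrite is vectorized per-position thresholding with NaN preservation and optional QC masking; the idiom in isolation (toy arrays, mirroring the hunk above):

```python
import numpy as np

# 4 reads × 3 positions of methylation probabilities; one missing call.
X_block = np.array([
    [0.9, 0.2, 0.7],
    [0.1, 0.8, np.nan],
    [0.6, 0.5, 0.9],
    [0.3, 0.9, 0.4],
])
# One Youden-derived threshold per position.
thresholds = np.array([0.5, 0.6, 0.8])
# Suppose the middle position failed QC: with mask_failed_positions=True it stays NaN.
qc_mask = np.array([True, False, True])

cols_to_binarize = np.where(qc_mask)[0]
block_out = np.full_like(X_block, np.nan)

sub_X = X_block[:, cols_to_binarize]
nan_mask = np.isnan(sub_X)
bin_sub = (sub_X > thresholds[cols_to_binarize][None, :]).astype(float)
bin_sub[nan_mask] = np.nan           # restore missing calls
block_out[:, cols_to_binarize] = bin_sub

print(block_out)  # column 1 is all-NaN; NaN inputs stay NaN
```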
smftools/preprocessing/binary_layers_to_ohe.py

```diff
@@ -1,28 +1,35 @@
+from __future__ import annotations
+
 ## binary_layers_to_ohe
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
 
-
-
+
+## Conversion SMF Specific
+def binary_layers_to_ohe(adata, binary_layers, stack="hstack"):
     """
     Parameters:
         adata (AnnData): Anndata object.
-        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
+        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
         stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
-
+
     Returns:
         ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     Input: An adata object and a list of layers containing a binary encoding.
     """
     import numpy as np
-    import anndata as ad
 
     # Ensure that the N layer is last!
     # Grab all binary layers that are not encoding N
-    ACGT_binary_layers = [
+    ACGT_binary_layers = [
+        layer for layer in binary_layers if "binary" in layer and layer != "N_binary_encoding"
+    ]
     # If there is a binary layer encoding N, hold it in N_binary_layer
-    N_binary_layer = [layer for layer in binary_layers if layer ==
+    N_binary_layer = [layer for layer in binary_layers if layer == "N_binary_encoding"]
     # Add the N_binary_encoding layer to the end of the list of binary layers
     all_binary_layers = ACGT_binary_layers + N_binary_layer
-
+    logger.info("Found %s layers in adata", all_binary_layers)
 
     # Extract the layers
     layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
@@ -33,8 +40,8 @@ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
         for layer in layers:
             read_ohe.append(layer[i])
         read_name = adata.obs_names[i]
-        if stack ==
+        if stack == "hstack":
            ohe_dict[read_name] = np.hstack(read_ohe)
-        elif stack ==
+        elif stack == "vstack":
            ohe_dict[read_name] = np.vstack(read_ohe)
-    return ohe_dict
+    return ohe_dict
```
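The `stack` argument only changes the shape of each read's encoding: `hstack` concatenates the base channels into one long vector, while `vstack` keeps one row per channel. A toy illustration for a single 5-bp read "ACGGT" (plain NumPy, outside any AnnData):

```python
import numpy as np

# Per-base binary encodings of one 5-bp read, with the N channel last.
read_layers = [
    np.array([1, 0, 0, 0, 0]),  # A_binary_encoding
    np.array([0, 1, 0, 0, 0]),  # C_binary_encoding
    np.array([0, 0, 1, 1, 0]),  # G_binary_encoding
    np.array([0, 0, 0, 0, 1]),  # T_binary_encoding
    np.array([0, 0, 0, 0, 0]),  # N_binary_encoding
]

hstacked = np.hstack(read_layers)  # shape (25,): channels concatenated end to end
vstacked = np.vstack(read_layers)  # shape (5, 5): one row per base channel
print(hstacked.shape, vstacked.shape)
```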
smftools/preprocessing/calculate_complexity_II.py

```diff
@@ -1,42 +1,62 @@
-from
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+from smftools.optional_imports import require
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
 def calculate_complexity_II(
-    adata,
-    output_directory=
-    sample_col=
-    ref_col: Optional[str] =
-    cluster_col=
-    plot=True,
-    save_plot=False,
-    n_boot=30,
-    n_depths=12,
-    random_state=0,
-    csv_summary=True,
-    uns_flag=
-    force_redo=False,
-    bypass=False
-):
-    """
-    Estimate and plot library complexity.
+    adata: "ad.AnnData",
+    output_directory: str | Path = "",
+    sample_col: str = "Sample_names",
+    ref_col: Optional[str] = "Reference_strand",
+    cluster_col: str = "sequence__merged_cluster_id",
+    plot: bool = True,
+    save_plot: bool = False,
+    n_boot: int = 30,
+    n_depths: int = 12,
+    random_state: int = 0,
+    csv_summary: bool = True,
+    uns_flag: str = "calculate_complexity_II_performed",
+    force_redo: bool = False,
+    bypass: bool = False,
+) -> None:
+    """Estimate and optionally plot library complexity.
 
-    If ref_col is None
-
+    If ``ref_col`` is ``None``, the calculation is performed per sample. If provided,
+    complexity is computed for each ``(sample, reference)`` pair.
 
-
-
-
-
-
+    Args:
+        adata: AnnData object containing read metadata.
+        output_directory: Directory for output plots/CSVs.
+        sample_col: Obs column containing sample names.
+        ref_col: Obs column with reference/strand categories, or ``None``.
+        cluster_col: Obs column with merged cluster IDs.
+        plot: Whether to generate plots.
+        save_plot: Whether to save plots to disk.
+        n_boot: Number of bootstrap iterations per depth.
+        n_depths: Number of subsampling depths to evaluate.
+        random_state: Random seed for bootstrapping.
+        csv_summary: Whether to write CSV summary files.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is present.
+        bypass: Whether to skip processing.
     """
     import os
+
     import numpy as np
     import pandas as pd
-    import matplotlib.pyplot as plt
     from scipy.optimize import curve_fit
-
+
+    plt = require("matplotlib.pyplot", extra="plotting", purpose="complexity plots")
 
     # early exits
     already = bool(adata.uns.get(uns_flag, False))
-    if
+    if already and not force_redo:
         return None
     if bypass:
         return None
@@ -44,9 +64,11 @@
     rng = np.random.default_rng(random_state)
 
     def lw(x, C0):
+        """Lander-Waterman curve for complexity estimation."""
        return C0 * (1.0 - np.exp(-x / C0))
 
     def sanitize(name: str) -> str:
+        """Sanitize a string for safe filenames."""
        return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
 
     # checks
@@ -77,7 +99,7 @@
     group_keys = []
     # iterate only pairs that exist in data to avoid empty processing
     for s in samples:
-        mask_s =
+        mask_s = adata.obs[sample_col] == s
         # find references present for this sample
         ref_present = pd.Categorical(adata.obs.loc[mask_s, ref_col]).categories
         # Use intersection of known reference categories and those present for sample
@@ -109,7 +131,7 @@
                 "ci_high": np.array([], dtype=float),
             }
             # also store back-compat key
-            adata.uns[f
+            adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
             continue
 
         # cluster ids array for this group
@@ -175,39 +197,45 @@
         }
 
         # save per-group in adata.uns for backward compatibility
-        adata.uns[f
+        adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
 
         # prepare curve and fit records for CSV
-        fit_records.append(
-
-
-
-
-
+        fit_records.append(
+            {
+                "sample": sample,
+                "reference": ref if ref_col is not None else "",
+                "C0": float(C0),
+                "n_reads": int(n_reads),
+                "n_unique_observed": int(observed_unique),
+            }
+        )
 
         x_fit = np.linspace(0, max(n_reads, int(depths[-1]) if depths.size else n_reads), 200)
         y_fit = lw(x_fit, C0)
         for d, mu, lo, hi in zip(depths, mean_unique, lo_ci, hi_ci):
-            curve_records.append(
-
-
-
-
-
-
-
+            curve_records.append(
+                {
+                    "sample": sample,
+                    "reference": ref if ref_col is not None else "",
+                    "type": "bootstrap",
+                    "depth": int(d),
+                    "mean_unique": float(mu),
+                    "ci_low": float(lo),
+                    "ci_high": float(hi),
+                }
+            )
        for xf, yf in zip(x_fit, y_fit):
-            curve_records.append(
-
-
-
-
-
-
-
+            curve_records.append(
+                {
+                    "sample": sample,
+                    "reference": ref if ref_col is not None else "",
+                    "type": "fit",
+                    "depth": float(xf),
+                    "mean_unique": float(yf),
+                    "ci_low": np.nan,
+                    "ci_high": np.nan,
+                }
+            )
 
         # plotting for this group
         if plot:
@@ -226,7 +254,9 @@
 
         if save_plot:
             fname = f"complexity_{sanitize(group_label)}.png"
-            plt.savefig(
+            plt.savefig(
+                os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight"
+            )
             plt.close()
         else:
             plt.show()
@@ -242,7 +272,7 @@
     fit_df = pd.DataFrame(fit_records)
     curve_df = pd.DataFrame(curve_records)
     base = output_directory or "."
-    fit_df.to_csv(os.path.join(base,
-    curve_df.to_csv(os.path.join(base,
+    fit_df.to_csv(os.path.join(base, "complexity_fit_summary.csv"), index=False)
+    curve_df.to_csv(os.path.join(base, "complexity_curves.csv"), index=False)
 
     return results
```
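`calculate_complexity_II` fits the single-parameter `lw` model to bootstrapped (depth, mean unique) points via `scipy.optimize.curve_fit`; the fit step in isolation looks roughly like this (synthetic points):

```python
import numpy as np
from scipy.optimize import curve_fit

def lw(x, C0):
    """Lander-Waterman curve for complexity estimation."""
    return C0 * (1.0 - np.exp(-x / C0))

# Synthetic (depth, mean unique molecules) pairs, as produced by the bootstrap.
depths = np.array([100, 500, 1000, 2500, 5000], dtype=float)
mean_unique = np.array([95, 394, 632, 918, 993], dtype=float)

# Fit the single free parameter C0 (the estimated library complexity).
(C0,), _ = curve_fit(lw, depths, mean_unique, p0=[mean_unique.max()])
print(f"estimated library complexity C0 ≈ {C0:.0f}")
```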
smftools/preprocessing/calculate_consensus.py

```diff
@@ -1,19 +1,28 @@
 # calculate_consensus
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
+def calculate_consensus(
+    adata: "ad.AnnData",
+    reference: str,
+    sample: str | bool = False,
+    reference_column: str = "Reference",
+    sample_column: str = "Sample",
+) -> None:
+    """Calculate a consensus sequence for a reference (and optional sample).
+
+    Args:
+        adata: AnnData object to append consensus metadata to.
+        reference: Reference name to subset on.
+        sample: If ``False``, uses all samples. If a string is passed, subsets to that sample.
+        reference_column: Obs column with reference names.
+        sample_column: Obs column with sample names.
     """
     import numpy as np
 
@@ -25,11 +34,11 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
         pass
 
     # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
-    layers = [layer for layer in record_subset.layers if
+    layers = [layer for layer in record_subset.layers if "_binary_" in layer]
     layer_map, layer_counts = {}, []
     for i, layer in enumerate(layers):
         # Gives an integer mapping to access which sequence base the binary layer is encoding
-        layer_map[i] = layer.split(
+        layer_map[i] = layer.split("_")[0]
         # Get the positional counts from all reads for the given base identity.
         layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
     # Combine the positional counts array derived from each binary base layer into an ndarray
@@ -40,8 +49,8 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
     consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
 
     if sample:
-        adata.var[f
+        adata.var[f"{reference}_consensus_from_{sample}"] = consensus_sequence_list
     else:
-        adata.var[f
+        adata.var[f"{reference}_consensus_across_samples"] = consensus_sequence_list
 
-    adata.uns[f
+    adata.uns[f"{reference}_consensus_sequence"] = consensus_sequence_list
```