smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +168 -145
- smftools/cli/load_adata.py +155 -95
- smftools/cli/preprocess_adata.py +222 -130
- smftools/cli/spatial_adata.py +441 -308
- smftools/cli_entry.py +4 -5
- smftools/config/conversion.yaml +12 -5
- smftools/config/deaminase.yaml +11 -9
- smftools/config/default.yaml +123 -19
- smftools/config/direct.yaml +3 -0
- smftools/config/experiment_config.py +120 -19
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/bam_functions.py +28 -29
- smftools/informatics/h5ad_functions.py +1 -1
- smftools/plotting/general_plotting.py +97 -51
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +2 -4
- smftools/preprocessing/append_base_context.py +34 -25
- smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
- smftools/preprocessing/binarize_on_Youden.py +10 -8
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +41 -25
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +94 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -70,24 +70,15 @@ def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None
 
 def align_and_sort_BAM(fasta,
                        input,
-
-
-                       make_bigwigs=False,
-                       threads=None,
-                       aligner='minimap2',
-                       aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
+                       cfg,
+                       ):
     """
    A wrapper for running dorado aligner and samtools functions
 
    Parameters:
        fasta (str): File path to the reference genome to align to.
        input (str): File path to the basecalled file to align. Works for .bam and .fastq files
-
-        output_directory (str): A file path to the directory to output all the analyses.
-        make_bigwigs (bool): Whether to make bigwigs
-        threads (int): Number of additional threads to use
-        aligner (str): Aligner to use. minimap2 and dorado options
-        aligner_args (list): list of optional parameters to use for the alignment
+        cfg: The configuration object
 
    Returns:
        None
@@ -97,40 +88,48 @@ def align_and_sort_BAM(fasta,
    input_suffix = input.suffix
    input_as_fastq = input.with_name(input.stem + '.fastq')
 
-    output_path_minus_suffix = output_directory / input.stem
+    output_path_minus_suffix = cfg.output_directory / input.stem
 
    aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
-    aligned_output = aligned_BAM.with_suffix(bam_suffix)
+    aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
    aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
-    aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)
+    aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
 
-    if threads:
-        threads = str(threads)
+    if cfg.threads:
+        threads = str(cfg.threads)
    else:
-
+        threads = None
 
-    if aligner == 'minimap2':
-
-
-
+    if cfg.aligner == 'minimap2':
+        if not cfg.align_from_bam:
+            print(f"Converting BAM to FASTQ: {input}")
+            _bam_to_fastq_with_pysam(input, input_as_fastq)
+            print(f"Aligning FASTQ to Reference: {input_as_fastq}")
+            mm_input = input_as_fastq
+        else:
+            print(f"Aligning BAM to Reference: {input}")
+            mm_input = input
+
        if threads:
-            minimap_command = ['minimap2'] + aligner_args + ['-t', threads, str(fasta), str(
+            minimap_command = ['minimap2'] + cfg.aligner_args + ['-t', threads, str(fasta), str(mm_input)]
        else:
-            minimap_command = ['minimap2'] + aligner_args + [str(fasta), str(
+            minimap_command = ['minimap2'] + cfg.aligner_args + [str(fasta), str(mm_input)]
        subprocess.run(minimap_command, stdout=open(aligned_output, "wb"))
-        os.remove(input_as_fastq)
 
-
+        if not cfg.align_from_bam:
+            os.remove(input_as_fastq)
+
+    elif cfg.aligner == 'dorado':
        # Run dorado aligner
        print(f"Aligning BAM to Reference: {input}")
        if threads:
-            alignment_command = ["dorado", "aligner", "-t", threads] + aligner_args + [str(fasta), str(input)]
+            alignment_command = ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
        else:
-            alignment_command = ["dorado", "aligner"] + aligner_args + [str(fasta), str(input)]
+            alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
        subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
 
    else:
-        print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
+        print(f'Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado')
        return
 
    # --- Sort & Index with pysam ---
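
In 0.2.4 the per-call alignment options are read off a single cfg object instead of free-standing arguments; the attributes dereferenced in this hunk are cfg.output_directory, cfg.bam_suffix, cfg.threads, cfg.aligner, cfg.aligner_args, and the new cfg.align_from_bam switch that skips the BAM-to-FASTQ round trip. A minimal stand-in sketch of such an object follows. The attribute names come from the hunk itself; the real object is built by smftools/config/experiment_config.py, and the values below are illustrative only, not defaults from the package.

from pathlib import Path
from types import SimpleNamespace

# Hypothetical stand-in for the cfg object consumed by the new
# align_and_sort_BAM signature; only the attributes this hunk reads are set.
cfg = SimpleNamespace(
    output_directory=Path("./aligned_out"),
    bam_suffix=".bam",
    threads=4,
    aligner="minimap2",                      # 'minimap2' or 'dorado'
    aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no'],
    align_from_bam=False,                    # new in 0.2.4: align the BAM directly when True
)

# align_and_sort_BAM(fasta='reference.fa', input=Path('reads.bam'), cfg=cfg)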
@@ -75,7 +75,7 @@ def add_read_length_and_mapping_qc(
    adata,
    bam_files: Optional[List[str]] = None,
    read_metrics: Optional[Dict[str, Union[list, tuple]]] = None,
-    uns_flag: str = "
+    uns_flag: str = "add_read_length_and_mapping_qc_performed",
    extract_read_features_from_bam_callable = None,
    bypass: bool = False,
    force_redo: bool = True
@@ -9,9 +9,62 @@ import os
 import math
 import pandas as pd
 
-from typing import Optional, Mapping, Sequence, Any, Dict, List
+from typing import Optional, Mapping, Sequence, Any, Dict, List, Tuple
 from pathlib import Path
 
+def _fixed_tick_positions(n_positions: int, n_ticks: int) -> np.ndarray:
+    """
+    Return indices for ~n_ticks evenly spaced labels across [0, n_positions-1].
+    Always includes 0 and n_positions-1 when possible.
+    """
+    n_ticks = int(max(2, n_ticks))
+    if n_positions <= n_ticks:
+        return np.arange(n_positions)
+
+    # linspace gives fixed count
+    pos = np.linspace(0, n_positions - 1, n_ticks)
+    return np.unique(np.round(pos).astype(int))
+
+def _select_labels(subset, sites: np.ndarray, reference: str, index_col_suffix: str | None):
+    """
+    Select tick labels for the heatmap axis.
+
+    Parameters
+    ----------
+    subset : AnnData view
+        The per-bin subset of the AnnData.
+    sites : np.ndarray[int]
+        Indices of the subset.var positions to annotate.
+    reference : str
+        Reference name (e.g., '6B6_top').
+    index_col_suffix : None or str
+        If None → use subset.var_names
+        Else → use subset.var[f"{reference}_{index_col_suffix}"]
+
+    Returns
+    -------
+    np.ndarray[str]
+        The labels to use for tick positions.
+    """
+    if sites.size == 0:
+        return np.array([])
+
+    # Default behavior: use var_names
+    if index_col_suffix is None:
+        return subset.var_names[sites].astype(str)
+
+    # Otherwise: use a computed column adata.var[f"{reference}_{suffix}"]
+    colname = f"{reference}_{index_col_suffix}"
+
+    if colname not in subset.var:
+        raise KeyError(
+            f"index_col_suffix='{index_col_suffix}' requires var column '{colname}', "
+            f"but it is not present in adata.var."
+        )
+
+    labels = subset.var[colname].astype(str).values
+    return labels[sites]
+
 def normalized_mean(matrix: np.ndarray) -> np.ndarray:
    mean = np.nanmean(matrix, axis=0)
    denom = (mean.max() - mean.min()) + 1e-9
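
For reference, a runnable sketch of what the first helper returns (the definition is copied from the hunk above so the example is self-contained):

import numpy as np

def _fixed_tick_positions(n_positions: int, n_ticks: int) -> np.ndarray:
    # Copied verbatim from the hunk above.
    n_ticks = int(max(2, n_ticks))
    if n_positions <= n_ticks:
        return np.arange(n_positions)
    pos = np.linspace(0, n_positions - 1, n_ticks)
    return np.unique(np.round(pos).astype(int))

print(_fixed_tick_positions(10, 4))  # [0 3 6 9] -- endpoints always included
print(_fixed_tick_positions(3, 8))   # [0 1 2]  -- fewer positions than ticks

_select_labels then maps the chosen indices to strings, either from subset.var_names (the default) or, when index_col_suffix is given, from the per-reference var column f"{reference}_{index_col_suffix}", raising KeyError when that column is absent.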
@@ -266,7 +319,6 @@ def clean_barplot(ax, mean_values, title):
        # traceback.print_exc()
        # continue
 
-
 def combined_hmm_raw_clustermap(
    adata,
    sample_col: str = "Sample_Names",
@@ -276,13 +328,13 @@ def combined_hmm_raw_clustermap(
 
    layer_gpc: str = "nan0_0minus1",
    layer_cpg: str = "nan0_0minus1",
-
+    layer_c: str = "nan0_0minus1",
    layer_a: str = "nan0_0minus1",
 
    cmap_hmm: str = "tab10",
    cmap_gpc: str = "coolwarm",
    cmap_cpg: str = "viridis",
-
+    cmap_c: str = "coolwarm",
    cmap_a: str = "coolwarm",
 
    min_quality: int = 20,
@@ -305,15 +357,17 @@ def combined_hmm_raw_clustermap(
    n_xticks_gpc: int = 8,
    n_xticks_cpg: int = 8,
    n_xticks_a: int = 8,
+
+    index_col_suffix: str | None = None,
 ):
    """
    Makes a multi-panel clustermap per (sample, reference):
-    HMM panel (always) + optional raw panels for
+    HMM panel (always) + optional raw panels for C, GpC, CpG, and A sites.
 
    Panels are added only if the corresponding site mask exists AND has >0 sites.
 
    sort_by options:
-        'gpc', 'cpg', '
+        'gpc', 'cpg', 'c', 'a', 'gpc_cpg', 'none', 'hmm', or 'obs:<col>'
    """
    def pick_xticks(labels: np.ndarray, n_ticks: int):
        if labels.size == 0:
@@ -363,18 +417,19 @@ def combined_hmm_raw_clustermap(
            return np.where(subset.var[k].values)[0]
        return np.array([], dtype=int)
 
-    gpc_sites
-    cpg_sites
+    gpc_sites = _sites(f"{ref}_GpC_site")
+    cpg_sites = _sites(f"{ref}_CpG_site")
    any_c_sites = _sites(f"{ref}_any_C_site", f"{ref}_C_site")
    any_a_sites = _sites(f"{ref}_A_site", f"{ref}_any_A_site")
 
-
-
-
-
-
-
-
+    # ---- labels via _select_labels ----
+    # HMM uses *all* columns
+    hmm_sites = np.arange(subset.n_vars, dtype=int)
+    hmm_labels = _select_labels(subset, hmm_sites, ref, index_col_suffix)
+    gpc_labels = _select_labels(subset, gpc_sites, ref, index_col_suffix)
+    cpg_labels = _select_labels(subset, cpg_sites, ref, index_col_suffix)
+    any_c_labels = _select_labels(subset, any_c_sites, ref, index_col_suffix)
+    any_a_labels = _select_labels(subset, any_a_sites, ref, index_col_suffix)
 
    # storage
    stacked_hmm = []
@@ -411,17 +466,21 @@ def combined_hmm_raw_clustermap(
                linkage = sch.linkage(sb[:, cpg_sites].layers[layer_cpg], method="ward")
                order = sch.leaves_list(linkage)
 
-            elif sort_by == "
-                linkage = sch.linkage(sb[:, any_c_sites].layers[
+            elif sort_by == "c" and any_c_sites.size:
+                linkage = sch.linkage(sb[:, any_c_sites].layers[layer_c], method="ward")
                order = sch.leaves_list(linkage)
 
-            elif sort_by == "
+            elif sort_by == "a" and any_a_sites.size:
                linkage = sch.linkage(sb[:, any_a_sites].layers[layer_a], method="ward")
                order = sch.leaves_list(linkage)
 
            elif sort_by == "gpc_cpg" and gpc_sites.size and cpg_sites.size:
                linkage = sch.linkage(sb.layers[layer_gpc], method="ward")
                order = sch.leaves_list(linkage)
+
+            elif sort_by == "hmm" and hmm_sites.size:
+                linkage = sch.linkage(sb[:, hmm_sites].layers[hmm_feature_layer], method="ward")
+                order = sch.leaves_list(linkage)
 
            else:
                order = np.arange(n)
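
Every sort_by branch, including the new 'c', 'a', and 'hmm' options, uses the same two-call idiom: Ward-linkage hierarchical clustering over the selected layer, then the dendrogram leaf order as the row display order. A self-contained sketch, with a toy matrix standing in for sb[:, sites].layers[<layer>]:

import numpy as np
import scipy.cluster.hierarchy as sch

# Three toy reads over four sites (1 = modified, -1 = unmodified).
reads = np.array([
    [ 1.0,  1.0, -1.0, -1.0],
    [-1.0, -1.0,  1.0,  1.0],
    [ 1.0,  1.0, -1.0,  1.0],
])
linkage = sch.linkage(reads, method="ward")  # cluster the rows (reads)
order = sch.leaves_list(linkage)             # dendrogram leaf order
reads_sorted = reads[order]                  # similar reads end up adjacent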
@@ -431,7 +490,7 @@ def combined_hmm_raw_clustermap(
            # ---- collect matrices ----
            stacked_hmm.append(sb.layers[hmm_feature_layer])
            if any_c_sites.size:
-                stacked_any_c.append(sb[:, any_c_sites].layers[
+                stacked_any_c.append(sb[:, any_c_sites].layers[layer_c])
            if gpc_sites.size:
                stacked_gpc.append(sb[:, gpc_sites].layers[layer_gpc])
            if cpg_sites.size:
@@ -449,12 +508,12 @@ def combined_hmm_raw_clustermap(
    mean_hmm = normalized_mean(hmm_matrix) if normalize_hmm else np.nanmean(hmm_matrix, axis=0)
 
    panels = [
-        ("HMM", hmm_matrix,
+        (f"HMM - {hmm_feature_layer}", hmm_matrix, hmm_labels, cmap_hmm, mean_hmm, n_xticks_hmm),
    ]
 
    if stacked_any_c:
        m = np.vstack(stacked_any_c)
-        panels.append(("
+        panels.append(("C", m, any_c_labels, cmap_c, methylation_fraction(m), n_xticks_any_c))
 
    if stacked_gpc:
        m = np.vstack(stacked_gpc)
@@ -777,29 +836,16 @@ def combined_hmm_raw_clustermap(
            # traceback.print_exc()
            # continue
 
-def _fixed_tick_positions(n_positions: int, n_ticks: int) -> np.ndarray:
-    """
-    Return indices for ~n_ticks evenly spaced labels across [0, n_positions-1].
-    Always includes 0 and n_positions-1 when possible.
-    """
-    n_ticks = int(max(2, n_ticks))
-    if n_positions <= n_ticks:
-        return np.arange(n_positions)
-
-    # linspace gives fixed count
-    pos = np.linspace(0, n_positions - 1, n_ticks)
-    return np.unique(np.round(pos).astype(int))
-
 def combined_raw_clustermap(
    adata,
    sample_col: str = "Sample_Names",
    reference_col: str = "Reference_strand",
    mod_target_bases: Sequence[str] = ("GpC", "CpG"),
-
+    layer_c: str = "nan0_0minus1",
    layer_gpc: str = "nan0_0minus1",
    layer_cpg: str = "nan0_0minus1",
    layer_a: str = "nan0_0minus1",
-
+    cmap_c: str = "coolwarm",
    cmap_gpc: str = "coolwarm",
    cmap_cpg: str = "viridis",
    cmap_a: str = "coolwarm",
@@ -809,20 +855,20 @@ def combined_raw_clustermap(
    min_position_valid_fraction: float = 0.5,
    sample_mapping: Optional[Mapping[str, str]] = None,
    save_path: str | Path | None = None,
-    sort_by: str = "gpc", # 'gpc','cpg','
+    sort_by: str = "gpc", # 'gpc','cpg','c','gpc_cpg','a','none','obs:<col>'
    bins: Optional[Dict[str, Any]] = None,
    deaminase: bool = False,
    min_signal: float = 0,
-    # NEW tick controls
    n_xticks_any_c: int = 10,
    n_xticks_gpc: int = 10,
    n_xticks_cpg: int = 10,
    n_xticks_any_a: int = 10,
    xtick_rotation: int = 90,
    xtick_fontsize: int = 9,
+    index_col_suffix: str | None = None,
 ):
    """
-    Plot stacked heatmaps + per-position mean barplots for
+    Plot stacked heatmaps + per-position mean barplots for C, GpC, CpG, and optional A.
 
    Key fixes vs old version:
    - order computed ONCE per bin, applied to all matrices
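
A hypothetical call sketch tying the new knobs together, assuming an AnnData adata is already loaded, that combined_raw_clustermap is in scope (e.g., imported from smftools' plotting module, an assumed import path), and that a var column named "<reference>_my_coords" exists for each plotted reference ("my_coords" is an invented suffix; all other arguments keep their defaults):

combined_raw_clustermap(
    adata,
    sort_by="c",                   # new option: Ward-cluster rows on the any-C layer
    layer_c="nan0_0minus1",        # new parameter in 0.2.4
    cmap_c="coolwarm",             # new parameter in 0.2.4
    index_col_suffix="my_coords",  # tick labels from adata.var["<reference>_my_coords"]
)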
@@ -898,14 +944,14 @@ def combined_raw_clustermap(
 
        num_any_c, num_gpc, num_cpg = len(any_c_sites), len(gpc_sites), len(cpg_sites)
 
-        any_c_labels = subset
-        gpc_labels = subset
-        cpg_labels = subset
+        any_c_labels = _select_labels(subset, any_c_sites, ref, index_col_suffix)
+        gpc_labels = _select_labels(subset, gpc_sites, ref, index_col_suffix)
+        cpg_labels = _select_labels(subset, cpg_sites, ref, index_col_suffix)
 
        if include_any_a:
            any_a_sites = np.where(subset.var.get(f"{ref}_A_site", False).values)[0]
            num_any_a = len(any_a_sites)
-            any_a_labels = subset
+            any_a_labels = _select_labels(subset, any_a_sites, ref, index_col_suffix)
 
        stacked_any_c, stacked_gpc, stacked_cpg, stacked_any_a = [], [], [], []
        row_labels, bin_labels, bin_boundaries = [], [], []
@@ -939,15 +985,15 @@ def combined_raw_clustermap(
                linkage = sch.linkage(subset_bin[:, cpg_sites].layers[layer_cpg], method="ward")
                order = sch.leaves_list(linkage)
 
-            elif sort_by == "
-                linkage = sch.linkage(subset_bin[:, any_c_sites].layers[
+            elif sort_by == "c" and num_any_c > 0:
+                linkage = sch.linkage(subset_bin[:, any_c_sites].layers[layer_c], method="ward")
                order = sch.leaves_list(linkage)
 
            elif sort_by == "gpc_cpg":
                linkage = sch.linkage(subset_bin.layers[layer_gpc], method="ward")
                order = sch.leaves_list(linkage)
 
-            elif sort_by == "
+            elif sort_by == "a" and num_any_a > 0:
                linkage = sch.linkage(subset_bin[:, any_a_sites].layers[layer_a], method="ward")
                order = sch.leaves_list(linkage)
@@ -961,7 +1007,7 @@ def combined_raw_clustermap(
 
            # stack consistently
            if include_any_c and num_any_c > 0:
-                stacked_any_c.append(subset_bin[:, any_c_sites].layers[
+                stacked_any_c.append(subset_bin[:, any_c_sites].layers[layer_c])
            if include_any_c and num_gpc > 0:
                stacked_gpc.append(subset_bin[:, gpc_sites].layers[layer_gpc])
            if include_any_c and num_cpg > 0:
@@ -990,11 +1036,11 @@ def combined_raw_clustermap(
 
    if any_c_matrix.size:
        blocks.append(dict(
-            name="
+            name="c",
            matrix=any_c_matrix,
            mean=mean_any_c,
            labels=any_c_labels,
-            cmap=
+            cmap=cmap_c,
            n_xticks=n_xticks_any_c,
            title="any C site Modification Signal"
        ))
@@ -1024,7 +1070,7 @@ def combined_raw_clustermap(
    mean_any_a = methylation_fraction(any_a_matrix) if any_a_matrix.size else None
    if any_a_matrix.size:
        blocks.append(dict(
-            name="
+            name="a",
            matrix=any_a_matrix,
            mean=mean_any_a,
            labels=any_a_labels,
@@ -1141,7 +1187,7 @@ def plot_hmm_layers_rolling_by_sample_ref(
    output_dir: Optional[str] = None,
    save: bool = True,
    show_raw: bool = False,
-    cmap: str = "
+    cmap: str = "tab20",
    use_var_coords: bool = True,
 ):
    """
@@ -90,7 +90,7 @@ def plot_volcano_relative_risk(
        safe_name = f"{ref}_{group_label}".replace("=", "").replace("__", "_").replace(",", "_").replace(" ", "_")
        out_file = os.path.join(save_path, f"{safe_name}.png")
        plt.savefig(out_file, dpi=300)
-        print(f"
+        print(f"Saved: {out_file}")
 
    plt.show()
 
@@ -449,7 +449,7 @@ def plot_positionwise_matrix_grid(
        os.makedirs(save_path, exist_ok=True)
        fname = outer_label.replace("_", "").replace("=", "") + ".png"
        plt.savefig(os.path.join(save_path, fname), dpi=300, bbox_inches='tight')
-        print(f"
+        print(f"Saved {fname}")
 
    plt.close(fig)
 
@@ -459,4 +459,4 @@ def plot_positionwise_matrix_grid(
    for outer_label in parsed['outer'].unique():
        plot_one_grid(outer_label)
 
-    print("
+    print("Finished plotting all grids.")
@@ -1,9 +1,7 @@
-from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
 from .append_base_context import append_base_context
 from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
 from .binarize_on_Youden import binarize_on_Youden
 from .binarize import binarize_adata
-from .calculate_complexity import calculate_complexity
 from .calculate_complexity_II import calculate_complexity_II
 from .calculate_read_modification_stats import calculate_read_modification_stats
 from .calculate_coverage import calculate_coverage
@@ -16,15 +14,15 @@ from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quality_mapping
 from .invert_adata import invert_adata
 from .load_sample_sheet import load_sample_sheet
 from .flag_duplicate_reads import flag_duplicate_reads
+from .reindex_references_adata import reindex_references_adata
 from .subsample_adata import subsample_adata
 
 __all__ = [
-    "add_read_length_and_mapping_qc",
    "append_base_context",
    "append_binary_layer_by_base_context",
    "binarize_on_Youden",
    "binarize_adata",
-    "
+    "calculate_complexity_II",
    "calculate_read_modification_stats",
    "calculate_coverage",
    "calculate_position_Youden",
@@ -1,18 +1,19 @@
 def append_base_context(adata,
-
+                        ref_column='Reference_strand',
                        use_consensus=False,
                        native=False,
                        mod_target_bases=['GpC', 'CpG'],
                        bypass=False,
                        force_redo=False,
-                        uns_flag='
+                        uns_flag='append_base_context_performed'
                        ):
    """
    Adds nucleobase context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
+    This needs to be performed prior to AnnData inversion step.
 
    Parameters:
        adata (AnnData): The input adata object.
-
+        ref_column (str): The observation column in which to stratify on. Default is 'Reference_strand', which should not be changed for most purposes.
        use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
        native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions
        mod_target_bases (list): Base contexts that may be modified.
@@ -30,7 +31,7 @@ def append_base_context(adata,
        return
 
    print('Adding base context based on reference FASTA sequence for sample')
-
+    references = adata.obs[ref_column].cat.categories
    site_types = []
 
    if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
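
The new lookup takes the reference names from the categories of the (now configurable) categorical obs column rather than from a fixed source. A small runnable illustration of the .cat.categories accessor it relies on:

import pandas as pd

obs = pd.DataFrame({
    "Reference_strand": pd.Categorical(["6B6_top", "6B6_top", "6B6_bottom"]),
})
references = obs["Reference_strand"].cat.categories
print(list(references))  # ['6B6_bottom', '6B6_top'] -- one entry per reference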
@@ -39,59 +40,60 @@ def append_base_context(adata,
    if 'A' in mod_target_bases:
        site_types += ['A_site']
 
-    for
+    for ref in references:
        # Assess if the strand is the top or bottom strand converted
-        if 'top' in
+        if 'top' in ref:
            strand = 'top'
-        elif 'bottom' in
+        elif 'bottom' in ref:
            strand = 'bottom'
 
        if native:
-            basename =
+            basename = ref.split(f"_{strand}")[0]
            if use_consensus:
                sequence = adata.uns[f'{basename}_consensus_sequence']
            else:
                # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
                sequence = adata.uns[f'{basename}_FASTA_sequence']
        else:
-            basename =
+            basename = ref.split(f"_{strand}")[0]
            if use_consensus:
                sequence = adata.uns[f'{basename}_consensus_sequence']
            else:
                # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
                sequence = adata.uns[f'{basename}_FASTA_sequence']
+
        # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
        boolean_dict = {}
        for site_type in site_types:
-            boolean_dict[f'{
+            boolean_dict[f'{ref}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
 
        if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
            if strand == 'top':
                # Iterate through the sequence and apply the criteria
                for i in range(1, len(sequence) - 1):
                    if sequence[i] == 'C':
-                        boolean_dict[f'{
+                        boolean_dict[f'{ref}_C_site'][i] = True
                        if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_GpC_site'][i] = True
                        elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
                        elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_CpG_site'][i] = True
                        elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_other_C_site'][i] = True
            elif strand == 'bottom':
                # Iterate through the sequence and apply the criteria
                for i in range(1, len(sequence) - 1):
                    if sequence[i] == 'G':
-                        boolean_dict[f'{
+                        boolean_dict[f'{ref}_C_site'][i] = True
                        if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_GpC_site'][i] = True
                        elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
                        elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_CpG_site'][i] = True
                        elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
-                            boolean_dict[f'{
+                            boolean_dict[f'{ref}_other_C_site'][i] = True
            else:
                print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
 
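
The top-strand rules this hunk fills in classify each C by its neighbors: G before and not G after is GpC; G on both sides is ambiguous GpC/CpG; G after only is CpG; neither is an "other" C (the bottom strand mirrors the same logic on G's with C neighbors). A standalone sketch of just the top-strand classification:

def classify_top_strand_c(sequence: str) -> dict:
    # Mirrors the branch order in the hunk above, collecting indices per site type.
    sites = {"GpC_site": [], "CpG_site": [], "ambiguous_GpC_CpG_site": [], "other_C_site": []}
    for i in range(1, len(sequence) - 1):
        if sequence[i] != "C":
            continue
        before_g = sequence[i - 1] == "G"
        after_g = sequence[i + 1] == "G"
        if before_g and not after_g:
            sites["GpC_site"].append(i)
        elif before_g and after_g:
            sites["ambiguous_GpC_CpG_site"].append(i)
        elif after_g:
            sites["CpG_site"].append(i)
        else:
            sites["other_C_site"].append(i)
    return sites

print(classify_top_strand_c("AGCATCGAGCGT"))
# {'GpC_site': [2], 'CpG_site': [5], 'ambiguous_GpC_CpG_site': [9], 'other_C_site': []}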
@@ -100,21 +102,28 @@ def append_base_context(adata,
                # Iterate through the sequence and apply the criteria
                for i in range(1, len(sequence) - 1):
                    if sequence[i] == 'A':
-                        boolean_dict[f'{
+                        boolean_dict[f'{ref}_A_site'][i] = True
            elif strand == 'bottom':
                # Iterate through the sequence and apply the criteria
                for i in range(1, len(sequence) - 1):
                    if sequence[i] == 'T':
-                        boolean_dict[f'{
+                        boolean_dict[f'{ref}_A_site'][i] = True
            else:
                print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
 
        for site_type in site_types:
-
+            # Site context annotations for each reference
+            adata.var[f'{ref}_{site_type}'] = boolean_dict[f'{ref}_{site_type}'].astype(bool)
+            # Restrict the site type labels to only be in positions that occur at a high enough frequency in the dataset
+            if adata.uns["calculate_coverage_performed"] == True:
+                adata.var[f'{ref}_{site_type}'] = (adata.var[f'{ref}_{site_type}']) & (adata.var[f'position_in_{ref}'])
+            else:
+                pass
+
            if native:
-                adata.obsm[f'{
+                adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].layers['binarized_methylation']
            else:
-                adata.obsm[f'{
+                adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].X
 
    # mark as done
    adata.uns[uns_flag] = True
@@ -6,7 +6,7 @@ def append_binary_layer_by_base_context(
    reference_column: str,
    smf_modality: str = "conversion",
    verbose: bool = True,
-    uns_flag: str = "
+    uns_flag: str = "append_binary_layer_by_base_context_performed",
    bypass: bool = False,
    force_redo: bool = False
 ):
@@ -27,7 +27,7 @@ def append_binary_layer_by_base_context(
 
    # Only run if not already performed
    already = bool(adata.uns.get(uns_flag, False))
-    if (already and not force_redo) or bypass or ("
+    if (already and not force_redo) or bypass or ("append_base_context_performed" not in adata.uns):
        # QC already performed; nothing to do
        return adata
 
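
Both preprocessing functions touched above now follow the same idempotency convention: a descriptive uns_flag default named "<function_name>_performed", a bypass escape hatch, a force_redo override, and a prerequisite check against another step's flag. A minimal sketch of the shared guard pattern (run_step and prerequisite_flag are illustrative names, not smftools API):

def run_step(adata, uns_flag, bypass=False, force_redo=False, prerequisite_flag=None):
    # Skip when the step already ran, when explicitly bypassed, or when a
    # required upstream step has not recorded its own flag yet.
    already = bool(adata.uns.get(uns_flag, False))
    if (already and not force_redo) or bypass or (
        prerequisite_flag is not None and prerequisite_flag not in adata.uns
    ):
        return adata
    # ... do the actual work here ...
    adata.uns[uns_flag] = True  # mark as done so reruns become no-ops
    return adata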