smftools-0.1.3-py3-none-any.whl → smftools-0.1.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/plotting/position_stats.py
@@ -0,0 +1,462 @@
+ def plot_volcano_relative_risk(
+     results_dict,
+     save_path=None,
+     highlight_regions=None,  # List of (start, end) tuples
+     highlight_color="lightgray",
+     highlight_alpha=0.3,
+     xlim=None,
+     ylim=None,
+ ):
+     """
+     Plot volcano-style log2(Relative Risk) vs Genomic Position for each group within each reference.
+ 
+     Parameters:
+         results_dict (dict): Output from calculate_relative_risk_by_group.
+             Format: dict[ref][group_label] = (results_df, sig_df)
+         save_path (str): Directory to save plots.
+         highlight_regions (list): List of (start, end) tuples for shaded regions.
+         highlight_color (str): Color for highlighted regions.
+         highlight_alpha (float): Alpha for highlighted region.
+         xlim (tuple): Optional x-axis limit.
+         ylim (tuple): Optional y-axis limit.
+     """
+     import matplotlib.pyplot as plt
+     import numpy as np
+     import os
+ 
+     for ref, group_results in results_dict.items():
+         for group_label, (results_df, _) in group_results.items():
+             if results_df.empty:
+                 print(f"Skipping empty results for {ref} / {group_label}")
+                 continue
+ 
+             # Split by site type
+             gpc_df = results_df[results_df['GpC_Site']]
+             cpg_df = results_df[results_df['CpG_Site']]
+ 
+             fig, ax = plt.subplots(figsize=(12, 6))
+ 
+             # Highlight regions
+             if highlight_regions:
+                 for start, end in highlight_regions:
+                     ax.axvspan(start, end, color=highlight_color, alpha=highlight_alpha)
+ 
+             # GpC as circles
+             sc1 = ax.scatter(
+                 gpc_df['Genomic_Position'],
+                 gpc_df['log2_Relative_Risk'],
+                 c=gpc_df['-log10_Adj_P'],
+                 cmap='coolwarm',
+                 edgecolor='k',
+                 s=40,
+                 marker='o',
+                 label='GpC'
+             )
+ 
+             # CpG as stars
+             sc2 = ax.scatter(
+                 cpg_df['Genomic_Position'],
+                 cpg_df['log2_Relative_Risk'],
+                 c=cpg_df['-log10_Adj_P'],
+                 cmap='coolwarm',
+                 edgecolor='k',
+                 s=60,
+                 marker='*',
+                 label='CpG'
+             )
+ 
+             ax.axhline(y=0, color='gray', linestyle='--')
+             ax.set_xlabel("Genomic Position")
+             ax.set_ylabel("log2(Relative Risk)")
+             ax.set_title(f"{ref} / {group_label} — Relative Risk vs Genomic Position")
+ 
+             if xlim:
+                 ax.set_xlim(xlim)
+             if ylim:
+                 ax.set_ylim(ylim)
+ 
+             ax.spines['top'].set_visible(False)
+             ax.spines['right'].set_visible(False)
+ 
+             cbar = plt.colorbar(sc1, ax=ax)
+             cbar.set_label("-log10(Adjusted P-Value)")
+ 
+             ax.legend()
+             plt.tight_layout()
+ 
+             # Save if requested
+             if save_path:
+                 os.makedirs(save_path, exist_ok=True)
+                 safe_name = f"{ref}_{group_label}".replace("=", "").replace("__", "_").replace(",", "_").replace(" ", "_")
+                 out_file = os.path.join(save_path, f"{safe_name}.png")
+                 plt.savefig(out_file, dpi=300)
+                 print(f"📁 Saved: {out_file}")
+ 
+             plt.show()
+ 
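A minimal, self-contained usage sketch for the function above. The synthetic frame and the locus/group names are illustrative assumptions; in practice results_dict comes from calculate_relative_risk_by_group:

import numpy as np
import pandas as pd

# Build a toy results_dict with the columns the plotting code reads:
# 'Genomic_Position', 'log2_Relative_Risk', '-log10_Adj_P', 'GpC_Site', 'CpG_Site'.
rng = np.random.default_rng(0)
positions = np.arange(1000, 1500, 10)
results_df = pd.DataFrame({
    "Genomic_Position": positions,
    "log2_Relative_Risk": rng.normal(0, 1, positions.size),
    "-log10_Adj_P": rng.uniform(0, 5, positions.size),
    "GpC_Site": rng.random(positions.size) < 0.5,
})
results_df["CpG_Site"] = ~results_df["GpC_Site"]  # toy data: each site is exactly one type

results_dict = {"locus1_top": {"treated": (results_df, None)}}  # (results_df, sig_df) per the docstring
plot_volcano_relative_risk(results_dict, highlight_regions=[(1200, 1300)])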
+ def plot_bar_relative_risk(
+     results_dict,
+     sort_by_position=True,
+     xlim=None,
+     ylim=None,
+     save_path=None,
+     highlight_regions=None,  # List of (start, end) tuples
+     highlight_color="lightgray",
+     highlight_alpha=0.3
+ ):
+     """
+     Plot log2(Relative Risk) as a bar plot across genomic positions for each group within each reference.
+ 
+     Parameters:
+         results_dict (dict): Output from calculate_relative_risk_by_group.
+         sort_by_position (bool): Whether to sort bars left-to-right by genomic coordinate.
+         xlim, ylim (tuple): Axis limits.
+         save_path (str or None): Directory to save plots.
+         highlight_regions (list of tuple): List of (start, end) genomic regions to shade.
+         highlight_color (str): Color of shaded region.
+         highlight_alpha (float): Transparency of shaded region.
+     """
+     import matplotlib.pyplot as plt
+     import numpy as np
+     import os
+ 
+     for ref, group_data in results_dict.items():
+         for group_label, (df, _) in group_data.items():
+             if df.empty:
+                 print(f"Skipping empty result for {ref} / {group_label}")
+                 continue
+ 
+             df = df.copy()
+             df['Genomic_Position'] = df['Genomic_Position'].astype(int)
+ 
+             if sort_by_position:
+                 df = df.sort_values('Genomic_Position')
+ 
+             gpc_mask = df['GpC_Site'] & ~df['CpG_Site']
+             cpg_mask = df['CpG_Site'] & ~df['GpC_Site']
+             both_mask = df['GpC_Site'] & df['CpG_Site']
+ 
+             fig, ax = plt.subplots(figsize=(14, 6))
+ 
+             # Optional shaded regions
+             if highlight_regions:
+                 for start, end in highlight_regions:
+                     ax.axvspan(start, end, color=highlight_color, alpha=highlight_alpha)
+ 
+             # Bar plots
+             ax.bar(
+                 df['Genomic_Position'][gpc_mask],
+                 df['log2_Relative_Risk'][gpc_mask],
+                 width=10,
+                 color='steelblue',
+                 label='GpC Site',
+                 edgecolor='black'
+             )
+ 
+             ax.bar(
+                 df['Genomic_Position'][cpg_mask],
+                 df['log2_Relative_Risk'][cpg_mask],
+                 width=10,
+                 color='darkorange',
+                 label='CpG Site',
+                 edgecolor='black'
+             )
+ 
+             if both_mask.any():
+                 ax.bar(
+                     df['Genomic_Position'][both_mask],
+                     df['log2_Relative_Risk'][both_mask],
+                     width=10,
+                     color='purple',
+                     label='GpC + CpG',
+                     edgecolor='black'
+                 )
+ 
+             ax.axhline(y=0, color='gray', linestyle='--')
+             ax.set_xlabel('Genomic Position')
+             ax.set_ylabel('log2(Relative Risk)')
+             ax.set_title(f"{ref} — {group_label}")
+             ax.legend()
+ 
+             if xlim:
+                 ax.set_xlim(xlim)
+             if ylim:
+                 ax.set_ylim(ylim)
+ 
+             ax.spines['top'].set_visible(False)
+             ax.spines['right'].set_visible(False)
+ 
+             plt.tight_layout()
+ 
+             if save_path:
+                 os.makedirs(save_path, exist_ok=True)
+                 safe_name = f"{ref}_{group_label}".replace("=", "").replace("__", "_").replace(",", "_")
+                 out_file = os.path.join(save_path, f"{safe_name}.png")
+                 plt.savefig(out_file, dpi=300)
+                 print(f"📁 Saved: {out_file}")
+ 
+             plt.show()
+ 
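The same toy results_dict from the sketch above works here as well; sites flagged as both GpC and CpG would get their own purple bars:

plot_bar_relative_risk(results_dict, ylim=(-3, 3))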
+ def plot_positionwise_matrix(
+     adata,
+     key="positionwise_result",
+     log_transform=False,
+     log_base="log1p",  # or 'log2', or None
+     triangle="full",
+     cmap="vlag",
+     figsize=(12, 10),  # Taller to accommodate line plot below
+     vmin=None,
+     vmax=None,
+     xtick_step=10,
+     ytick_step=10,
+     save_path=None,
+     highlight_position=None,  # Can be a single int/float or list of them
+     highlight_axis="row",  # "row" or "column"
+     annotate_points=False  # ✅ New option
+ ):
+     """
+     Plots positionwise matrices stored in adata.uns[key], with an optional line plot
+     for specified row(s) or column(s), and highlights them on the heatmap.
+     """
+     import matplotlib.pyplot as plt
+     import seaborn as sns
+     import numpy as np
+     import pandas as pd
+     import os
+ 
+     def find_closest_index(index, target):
+         index_vals = pd.to_numeric(index, errors="coerce")
+         target_val = pd.to_numeric([target], errors="coerce")[0]
+         diffs = pd.Series(np.abs(index_vals - target_val), index=index)
+         return diffs.idxmin()
+ 
+     # Ensure highlight_position is a list
+     if highlight_position is not None and not isinstance(highlight_position, (list, tuple, np.ndarray)):
+         highlight_position = [highlight_position]
+ 
+     for group, mat_df in adata.uns[key].items():
+         mat = mat_df.copy()
+ 
+         if log_transform:
+             with np.errstate(divide='ignore', invalid='ignore'):
+                 if log_base == "log1p":
+                     mat = np.log1p(mat)
+                 elif log_base == "log2":
+                     mat = np.log2(mat.replace(0, np.nanmin(mat[mat > 0]) * 0.1))
+             mat.replace([np.inf, -np.inf], np.nan, inplace=True)
+ 
+         # Set color limits for log2 to be centered around 0
+         if log_base == "log2" and log_transform and (vmin is None or vmax is None):
+             abs_max = np.nanmax(np.abs(mat.values))
+             vmin = -abs_max if vmin is None else vmin
+             vmax = abs_max if vmax is None else vmax
+ 
+         # Create mask for triangle
+         mask = None
+         if triangle == "lower":
+             mask = np.triu(np.ones_like(mat, dtype=bool), k=1)
+         elif triangle == "upper":
+             mask = np.tril(np.ones_like(mat, dtype=bool), k=-1)
+ 
+         xticks = mat.columns.astype(int)
+         yticks = mat.index.astype(int)
+ 
+         # 👉 Make taller figure: heatmap on top, line plot below
+         fig, axs = plt.subplots(2, 1, figsize=figsize, height_ratios=[3, 1.5])
+         heat_ax, line_ax = axs
+ 
+         # Heatmap
+         sns.heatmap(
+             mat,
+             mask=mask,
+             cmap=cmap,
+             xticklabels=xticks,
+             yticklabels=yticks,
+             square=True,
+             vmin=vmin,
+             vmax=vmax,
+             cbar_kws={"label": f"{key} ({log_base})" if log_transform else key},
+             ax=heat_ax
+         )
+ 
+         heat_ax.set_title(f"{key} — {group}", pad=20)
+         heat_ax.set_xticks(np.arange(0, len(xticks), xtick_step))
+         heat_ax.set_xticklabels(xticks[::xtick_step], rotation=90)
+         heat_ax.set_yticks(np.arange(0, len(yticks), ytick_step))
+         heat_ax.set_yticklabels(yticks[::ytick_step])
+ 
+         # Line plot
+         if highlight_position is not None:
+             colors = plt.cm.tab10.colors
+             for i, pos in enumerate(highlight_position):
+                 try:
+                     if highlight_axis == "row":
+                         closest = find_closest_index(mat.index, pos)
+                         series = mat.loc[closest]
+                         x_vals = pd.to_numeric(series.index, errors="coerce")
+                         idx = mat.index.get_loc(closest)
+                         heat_ax.axhline(idx, color=colors[i % len(colors)], linestyle="--", linewidth=1)
+                         label = f"Row {pos} → {closest}"
+                     else:
+                         closest = find_closest_index(mat.columns, pos)
+                         series = mat[closest]
+                         x_vals = pd.to_numeric(series.index, errors="coerce")
+                         idx = mat.columns.get_loc(closest)
+                         heat_ax.axvline(idx, color=colors[i % len(colors)], linestyle="--", linewidth=1)
+                         label = f"Col {pos} → {closest}"
+ 
+                     line = line_ax.plot(x_vals, series.values, marker='o', label=label, color=colors[i % len(colors)])
+ 
+                     # Annotate each point
+                     if annotate_points:
+                         for x, y in zip(x_vals, series.values):
+                             if not np.isnan(y):
+                                 line_ax.annotate(
+                                     f"{y:.2f}",
+                                     xy=(x, y),
+                                     textcoords="offset points",
+                                     xytext=(0, 5),
+                                     ha='center',
+                                     fontsize=8
+                                 )
+                 except Exception as e:
+                     line_ax.text(0.5, 0.5, f"⚠️ Error plotting {highlight_axis} @ {pos}",
+                                  ha='center', va='center', fontsize=10)
+                     print(f"Error plotting line for {highlight_axis}={pos}: {e}")
+ 
+             line_ax.set_title(f"{highlight_axis.capitalize()} Profile(s)")
+             line_ax.set_xlabel(f"{'Column' if highlight_axis == 'row' else 'Row'} position")
+             line_ax.set_ylabel("Value")
+             line_ax.grid(True)
+             line_ax.legend(fontsize=8)
+ 
+         plt.tight_layout()
+ 
+         # Save if requested
+         if save_path:
+             os.makedirs(save_path, exist_ok=True)
+             safe_name = group.replace("=", "").replace("__", "_").replace(",", "_")
+             out_file = os.path.join(save_path, f"{key}_{safe_name}.png")
+             plt.savefig(out_file, dpi=300)
+             print(f"📁 Saved: {out_file}")
+ 
+         plt.show()
+ 
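A sketch of the expected input layout, assuming adata.uns[key] maps a group label to a square position-by-position DataFrame (synthetic matrix and labels; the real matrices come from the package's positionwise tools):

import anndata as ad
import numpy as np
import pandas as pd

positions = list(range(0, 100, 10))
mat = pd.DataFrame(np.random.rand(len(positions), len(positions)),
                   index=positions, columns=positions)

adata = ad.AnnData(X=np.zeros((1, len(positions))))  # minimal AnnData; only .uns is used here
adata.uns["positionwise_result"] = {"locus1_top_treated": mat}

# Draw the heatmap plus a line profile for the row nearest position 45.
plot_positionwise_matrix(adata, key="positionwise_result",
                         highlight_position=45, highlight_axis="row")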
+ def plot_positionwise_matrix_grid(
+     adata,
+     key,
+     outer_keys=["Reference_strand", "activity_status"],
+     inner_keys=["Promoter_Open", "Enhancer_Open"],
+     log_transform=None,
+     vmin=None,
+     vmax=None,
+     cmap="vlag",
+     save_path=None,
+     figsize=(10, 10),
+     xtick_step=10,
+     ytick_step=10,
+     parallel=False,
+     max_threads=None
+ ):
+     import matplotlib.pyplot as plt
+     import seaborn as sns
+     import numpy as np
+     import pandas as pd
+     import os
+     from matplotlib.gridspec import GridSpec
+     from joblib import Parallel, delayed
+ 
+     matrices = adata.uns[key]
+     group_labels = list(matrices.keys())
+ 
+     parsed_inner = pd.DataFrame([dict(zip(inner_keys, g.split("_")[-len(inner_keys):])) for g in group_labels])
+     parsed_outer = pd.Series(["_".join(g.split("_")[:-len(inner_keys)]) for g in group_labels], name="outer")
+     parsed = pd.concat([parsed_outer, parsed_inner], axis=1)
+ 
+     def plot_one_grid(outer_label):
+         selected = parsed[parsed['outer'] == outer_label].copy()
+         selected["group_str"] = [f"{outer_label}_{row[inner_keys[0]]}_{row[inner_keys[1]]}" for _, row in selected.iterrows()]
+ 
+         row_vals = sorted(selected[inner_keys[0]].unique())
+         col_vals = sorted(selected[inner_keys[1]].unique())
+ 
+         fig = plt.figure(figsize=figsize)
+         gs = GridSpec(len(row_vals), len(col_vals) + 1, width_ratios=[1]*len(col_vals) + [0.05], wspace=0.3)
+         axes = np.empty((len(row_vals), len(col_vals)), dtype=object)
+ 
+         local_vmin, local_vmax = vmin, vmax
+         if log_transform == "log2" and (vmin is None or vmax is None):
+             all_data = []
+             for group_str in selected["group_str"]:
+                 mat = matrices.get(group_str)
+                 if mat is not None:
+                     all_data.append(np.log2(mat.replace(0, 1e-9).values))
+             if all_data:
+                 combined = np.concatenate([arr.flatten() for arr in all_data])
+                 vmax_auto = np.nanmax(np.abs(combined))
+                 local_vmin = -vmax_auto if vmin is None else vmin
+                 local_vmax = vmax_auto if vmax is None else vmax
+ 
+         cbar_label = {
+             "log2": "log2(Value)",
+             "log1p": "log1p(Value)"
+         }.get(log_transform, "Value")
+ 
+         cbar_ax = fig.add_subplot(gs[:, -1])
+ 
+         for i, row_val in enumerate(row_vals):
+             for j, col_val in enumerate(col_vals):
+                 group_label = f"{outer_label}_{row_val}_{col_val}"
+                 ax = fig.add_subplot(gs[i, j])
+                 axes[i, j] = ax
+                 mat = matrices.get(group_label)
+                 if mat is None:
+                     ax.axis("off")
+                     continue
+ 
+                 data = mat.copy()
+                 if log_transform == "log2":
+                     data = np.log2(data.replace(0, 1e-9))
+                 elif log_transform == "log1p":
+                     data = np.log1p(data)
+ 
+                 sns.heatmap(
+                     data,
+                     ax=ax,
+                     cmap=cmap,
+                     xticklabels=True,
+                     yticklabels=True,
+                     square=True,
+                     vmin=local_vmin,
+                     vmax=local_vmax,
+                     cbar=(i == 0 and j == 0),
+                     cbar_ax=cbar_ax if (i == 0 and j == 0) else None,
+                     cbar_kws={"label": cbar_label if (i == 0 and j == 0) else ""}
+                 )
+                 ax.set_title(f"{inner_keys[0]}={row_val}, {inner_keys[1]}={col_val}", fontsize=9, pad=8)
+ 
+                 xticks = data.columns.astype(int)
+                 yticks = data.index.astype(int)
+                 ax.set_xticks(np.arange(0, len(xticks), xtick_step))
+                 ax.set_xticklabels(xticks[::xtick_step], rotation=90)
+                 ax.set_yticks(np.arange(0, len(yticks), ytick_step))
+                 ax.set_yticklabels(yticks[::ytick_step])
+ 
+         fig.suptitle(f"{key} • {outer_label}", fontsize=14, y=1.02)
+         fig.tight_layout(rect=[0, 0, 0.97, 0.95])
+ 
+         if save_path:
+             os.makedirs(save_path, exist_ok=True)
+             fname = outer_label.replace("_", "").replace("=", "") + ".png"
+             plt.savefig(os.path.join(save_path, fname), dpi=300, bbox_inches='tight')
+             print(f"✅ Saved {fname}")
+ 
+         plt.close(fig)
+ 
+     if parallel:
+         Parallel(n_jobs=max_threads)(delayed(plot_one_grid)(outer_label) for outer_label in parsed['outer'].unique())
+     else:
+         for outer_label in parsed['outer'].unique():
+             plot_one_grid(outer_label)
+ 
+     print("✅ Finished plotting all grids.")
smftools/preprocessing/__init__.py
@@ -6,13 +6,13 @@ from .calculate_coverage import calculate_coverage
  from .calculate_position_Youden import calculate_position_Youden
  from .calculate_read_length_stats import calculate_read_length_stats
  from .clean_NaN import clean_NaN
+ from .filter_adata_by_nan_proportion import filter_adata_by_nan_proportion
  from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
  from .filter_reads_on_length import filter_reads_on_length
  from .invert_adata import invert_adata
  from .load_sample_sheet import load_sample_sheet
- from .mark_duplicates import mark_duplicates
- from .remove_duplicates import remove_duplicates
- from .recipes import recipe_1_Kissiov_and_McKenna_2025, recipe_2_Kissiov_and_McKenna_2025
+ from .flag_duplicate_reads import flag_duplicate_reads
+ from .subsample_adata import subsample_adata
  
  __all__ = [
      "append_C_context",
@@ -23,12 +23,11 @@ __all__ = [
      "calculate_position_Youden",
      "calculate_read_length_stats",
      "clean_NaN",
+     "filter_adata_by_nan_proportion",
      "filter_converted_reads_on_methylation",
      "filter_reads_on_length",
      "invert_adata",
      "load_sample_sheet",
-     "mark_duplicates",
-     "remove_duplicates",
-     "recipe_1_Kissiov_and_McKenna_2025",
-     "recipe_2_Kissiov_and_McKenna_2025"
+     "flag_duplicate_reads",
+     "subsample_adata"
  ]
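The net effect of these two hunks on the public preprocessing API (an import check against 0.1.7; the recipes module still ships per the file list above, it is just no longer re-exported here):

from smftools.preprocessing import (
    filter_adata_by_nan_proportion,  # new export in 0.1.7
    flag_duplicate_reads,            # replaces mark_duplicates / remove_duplicates
    subsample_adata,                 # new export in 0.1.7
)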
smftools/preprocessing/append_C_context.py
@@ -2,7 +2,7 @@
  
  ## Conversion SMF Specific
  # Read methylation QC
- def append_C_context(adata, obs_column='Reference', use_consensus=False):
+ def append_C_context(adata, obs_column='Reference', use_consensus=False, native=False):
      """
      Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
  
@@ -10,14 +10,17 @@
          adata (AnnData): The input adata object.
          obs_column (str): The observation column in which to stratify on. Default is 'Reference', which should not be changed for most purposes.
          use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
-         Input: An adata object, the obs_column of interst, and whether to use the consensus sequence from the category.
+         native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions
  
      Returns:
          None
      """
      import numpy as np
      import anndata as ad
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+ 
+     print('Adding Cytosine context based on reference FASTA sequence for sample')
+ 
+     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C', 'any_C_site']
      categories = adata.obs[obs_column].cat.categories
      for cat in categories:
          # Assess if the strand is the top or bottom strand converted
@@ -26,11 +29,20 @@
          elif 'bottom' in cat:
              strand = 'bottom'
  
-         if use_consensus:
-             sequence = adata.uns[f'{cat}_consensus_sequence']
+         if native:
+             basename = cat.split(f"_{strand}")[0]
+             if use_consensus:
+                 sequence = adata.uns[f'{basename}_consensus_sequence']
+             else:
+                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
+                 sequence = adata.uns[f'{basename}_FASTA_sequence']
          else:
-             # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
-             sequence = adata.uns[f'{cat}_FASTA_sequence']
+             basename = cat.split(f"_{strand}")[0]
+             if use_consensus:
+                 sequence = adata.uns[f'{basename}_consensus_sequence']
+             else:
+                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
+                 sequence = adata.uns[f'{basename}_FASTA_sequence']
          # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
          boolean_dict = {}
          for site_type in site_types:
@@ -40,6 +52,7 @@
              # Iterate through the sequence and apply the criteria
              for i in range(1, len(sequence) - 1):
                  if sequence[i] == 'C':
+                     boolean_dict[f'{cat}_any_C_site'][i] = True
                      if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
                          boolean_dict[f'{cat}_GpC_site'][i] = True
                      elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
@@ -52,6 +65,7 @@
              # Iterate through the sequence and apply the criteria
              for i in range(1, len(sequence) - 1):
                  if sequence[i] == 'G':
+                     boolean_dict[f'{cat}_any_C_site'][i] = True
                      if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
                          boolean_dict[f'{cat}_GpC_site'][i] = True
                      elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
@@ -65,5 +79,4 @@
  
          for site_type in site_types:
              adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].copy().X
-
+             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
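A quick sanity check of the top-strand rules visible in these hunks: every interior C is now also flagged any_C_site, G·C·non-G is a GpC site, and G·C·G is ambiguous. The CpG and other_C branches fall outside the hunk context, so this toy sketch mirrors only what is shown:

seq = "TGCAGCGT"
for i in range(1, len(seq) - 1):
    if seq[i] == 'C':  # every interior C would set any_C_site
        gpc = seq[i - 1] == 'G' and seq[i + 1] != 'G'
        ambiguous = seq[i - 1] == 'G' and seq[i + 1] == 'G'
        print(i, seq[i - 1:i + 2], "GpC" if gpc else "ambiguous" if ambiguous else "other")
# prints: 2 GCA GpC
#         5 GCG ambiguous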