PyPI - smftools - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl - Mend

smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

smftools/plotting/position_stats.py CHANGED Viewed

@@ -90,7 +90,7 @@ def plot_volcano_relative_risk(
                 safe_name = f"{ref}_{group_label}".replace("=", "").replace("__", "_").replace(",", "_").replace(" ", "_")
                 out_file = os.path.join(save_path, f"{safe_name}.png")
                 plt.savefig(out_file, dpi=300)
-                print(f"📁 Saved: {out_file}")
+                print(f"Saved: {out_file}")
             plt.show()
@@ -449,7 +449,7 @@ def plot_positionwise_matrix_grid(
             os.makedirs(save_path, exist_ok=True)
             fname = outer_label.replace("_", "").replace("=", "") + ".png"
             plt.savefig(os.path.join(save_path, fname), dpi=300, bbox_inches='tight')
-            print(f"✅ Saved {fname}")
+            print(f"Saved {fname}")
         plt.close(fig)
@@ -459,4 +459,4 @@ def plot_positionwise_matrix_grid(
         for outer_label in parsed['outer'].unique():
             plot_one_grid(outer_label)
-    print("✅ Finished plotting all grids.")
+    print("Finished plotting all grids.")

smftools/preprocessing/__init__.py CHANGED Viewed

@@ -1,8 +1,7 @@
-from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
 from .append_base_context import append_base_context
 from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
 from .binarize_on_Youden import binarize_on_Youden
-from .calculate_complexity import calculate_complexity
+from .binarize import binarize_adata
 from .calculate_complexity_II import calculate_complexity_II
 from .calculate_read_modification_stats import calculate_read_modification_stats
 from .calculate_coverage import calculate_coverage
@@ -15,14 +14,15 @@ from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quali
 from .invert_adata import invert_adata
 from .load_sample_sheet import load_sample_sheet
 from .flag_duplicate_reads import flag_duplicate_reads
+from .reindex_references_adata import reindex_references_adata
 from .subsample_adata import subsample_adata
 __all__ = [
-    "add_read_length_and_mapping_qc",
     "append_base_context",
     "append_binary_layer_by_base_context",
     "binarize_on_Youden",
-    "calculate_complexity",
+    "binarize_adata",
+    "calculate_complexity_II",
     "calculate_read_modification_stats",
     "calculate_coverage",
     "calculate_position_Youden",

smftools/preprocessing/append_base_context.py CHANGED Viewed

@@ -1,18 +1,19 @@
 def append_base_context(adata,
-                        obs_column='Reference_strand',
+                        ref_column='Reference_strand',
                         use_consensus=False,
                         native=False,
                         mod_target_bases=['GpC', 'CpG'],
                         bypass=False,
                         force_redo=False,
-                        uns_flag='base_context_added'
+                        uns_flag='append_base_context_performed'
 ):
     """
     Adds nucleobase context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
+    This needs to be performed prior to AnnData inversion step.
     Parameters:
         adata (AnnData): The input adata object.
-        obs_column (str): The observation column in which to stratify on. Default is 'Reference_strand', which should not be changed for most purposes.
+        ref_column (str): The observation column in which to stratify on. Default is 'Reference_strand', which should not be changed for most purposes.
         use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
         native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions
         mod_target_bases (list): Base contexts that may be modified.
@@ -30,68 +31,69 @@ def append_base_context(adata,
         return
     print('Adding base context based on reference FASTA sequence for sample')
-    categories = adata.obs[obs_column].cat.categories
+    references = adata.obs[ref_column].cat.categories
     site_types = []
     if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
-        site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'any_C_site']
+        site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
     if 'A' in mod_target_bases:
         site_types += ['A_site']
-    for cat in categories:
+    for ref in references:
         # Assess if the strand is the top or bottom strand converted
-        if 'top' in cat:
+        if 'top' in ref:
             strand = 'top'
-        elif 'bottom' in cat:
+        elif 'bottom' in ref:
             strand = 'bottom'
         if native:
-            basename = cat.split(f"_{strand}")[0]
+            basename = ref.split(f"_{strand}")[0]
             if use_consensus:
                 sequence = adata.uns[f'{basename}_consensus_sequence']
             else:
                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
                 sequence = adata.uns[f'{basename}_FASTA_sequence']
         else:
-            basename = cat.split(f"_{strand}")[0]
+            basename = ref.split(f"_{strand}")[0]
             if use_consensus:
                 sequence = adata.uns[f'{basename}_consensus_sequence']
             else:
                 # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
                 sequence = adata.uns[f'{basename}_FASTA_sequence']
         # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
         boolean_dict = {}
         for site_type in site_types:
-            boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+            boolean_dict[f'{ref}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
         if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
             if strand == 'top':
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
                     if sequence[i] == 'C':
-                        boolean_dict[f'{cat}_any_C_site'][i] = True
+                        boolean_dict[f'{ref}_C_site'][i] = True
                         if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                            boolean_dict[f'{cat}_GpC_site'][i] = True
+                            boolean_dict[f'{ref}_GpC_site'][i] = True
                         elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                            boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                            boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
                         elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                            boolean_dict[f'{cat}_CpG_site'][i] = True
+                            boolean_dict[f'{ref}_CpG_site'][i] = True
                         elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                            boolean_dict[f'{cat}_other_C_site'][i] = True
+                            boolean_dict[f'{ref}_other_C_site'][i] = True
             elif strand == 'bottom':
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
                     if sequence[i] == 'G':
-                        boolean_dict[f'{cat}_any_C_site'][i] = True
+                        boolean_dict[f'{ref}_C_site'][i] = True
                         if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
-                            boolean_dict[f'{cat}_GpC_site'][i] = True
+                            boolean_dict[f'{ref}_GpC_site'][i] = True
                         elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
-                            boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                            boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
                         elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
-                            boolean_dict[f'{cat}_CpG_site'][i] = True
+                            boolean_dict[f'{ref}_CpG_site'][i] = True
                         elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
-                            boolean_dict[f'{cat}_other_C_site'][i] = True
+                            boolean_dict[f'{ref}_other_C_site'][i] = True
             else:
                 print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
@@ -100,21 +102,28 @@ def append_base_context(adata,
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
                     if sequence[i] == 'A':
-                        boolean_dict[f'{cat}_A_site'][i] = True
+                        boolean_dict[f'{ref}_A_site'][i] = True
             elif strand == 'bottom':
                 # Iterate through the sequence and apply the criteria
                 for i in range(1, len(sequence) - 1):
                     if sequence[i] == 'T':
-                        boolean_dict[f'{cat}_A_site'][i] = True
+                        boolean_dict[f'{ref}_A_site'][i] = True
             else:
                 print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
         for site_type in site_types:
-            adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+            # Site context annotations for each reference
+            adata.var[f'{ref}_{site_type}'] = boolean_dict[f'{ref}_{site_type}'].astype(bool)
+            # Restrict the site type labels to only be in positions that occur at a high enough frequency in the dataset
+            if adata.uns["calculate_coverage_performed"] == True:
+                adata.var[f'{ref}_{site_type}'] = (adata.var[f'{ref}_{site_type}']) & (adata.var[f'position_in_{ref}'])
+            else:
+                pass
             if native:
-                adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].layers['binarized_methylation']
+                adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].layers['binarized_methylation']
             else:
-                adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
+                adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].X
     # mark as done
     adata.uns[uns_flag] = True

smftools/preprocessing/append_binary_layer_by_base_context.py CHANGED Viewed

@@ -6,7 +6,7 @@ def append_binary_layer_by_base_context(
     reference_column: str,
     smf_modality: str = "conversion",
     verbose: bool = True,
-    uns_flag: str = "binary_layers_by_base_context_added",
+    uns_flag: str = "append_binary_layer_by_base_context_performed",
     bypass: bool = False,
     force_redo: bool = False
 ):
@@ -15,7 +15,7 @@ def append_binary_layer_by_base_context(
       - GpC_site_binary
       - CpG_site_binary
       - GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
-      - any_C_site_binary
+      - C_site_binary
       - other_C_site_binary
     Behavior:
@@ -27,7 +27,7 @@ def append_binary_layer_by_base_context(
     # Only run if not already performed
     already = bool(adata.uns.get(uns_flag, False))
-    if (already and not force_redo) or bypass or ("base_context_added" not in adata.uns):
+    if (already and not force_redo) or bypass or ("append_base_context_performed" not in adata.uns):
         # QC already performed; nothing to do
         return adata
@@ -48,7 +48,7 @@ def append_binary_layer_by_base_context(
     references = adata.obs[reference_column].astype("category").cat.categories
     reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
     reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
-    reference_to_c_column = {ref: f"{ref}_any_C_site" for ref in references}
+    reference_to_c_column = {ref: f"{ref}_C_site" for ref in references}
     reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
     # verify var columns exist and build boolean masks per ref (len = n_vars)
@@ -124,7 +124,7 @@ def append_binary_layer_by_base_context(
     adata.layers['GpC_site_binary'] = masked_gpc
     adata.layers['CpG_site_binary'] = masked_cpg
     adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
-    adata.layers['any_C_site_binary'] = masked_any_c
+    adata.layers['C_site_binary'] = masked_any_c
     adata.layers['other_C_site_binary'] = masked_other_c
     if verbose:
@@ -134,7 +134,7 @@ def append_binary_layer_by_base_context(
         print(f"  GpC: {_filled_positions(masked_gpc)}")
         print(f"  CpG: {_filled_positions(masked_cpg)}")
         print(f"  GpC+CpG combined: {_filled_positions(combined_sum)}")
-        print(f"  any_C: {_filled_positions(masked_any_c)}")
+        print(f"  C: {_filled_positions(masked_any_c)}")
         print(f"  other_C: {_filled_positions(masked_other_c)}")
     # mark as done

smftools/preprocessing/binarize.py ADDED Viewed

@@ -0,0 +1,17 @@
+import numpy as np
+def binarize_adata(adata, source="X", target_layer="binary", threshold=0.8):
+    """
+    Binarize a dense matrix and preserve NaN.
+    source: "X" or layer name
+    """
+    X = adata.X if source == "X" else adata.layers[source]
+    # Copy to avoid modifying original in-place
+    X_bin = X.copy()
+    # Where not NaN: apply threshold
+    mask = ~np.isnan(X_bin)
+    X_bin[mask] = (X_bin[mask] > threshold).astype(np.int8)
+    adata.layers[target_layer] = X_bin

smftools/preprocessing/binarize_on_Youden.py CHANGED Viewed

@@ -1,4 +1,6 @@
-def binarize_on_Youden(adata, obs_column='Reference'):
+def binarize_on_Youden(adata,
+                       ref_column='Reference_strand',
+                       output_layer_name='binarized_methylation'):
     """
     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
@@ -16,18 +18,18 @@ def binarize_on_Youden(adata, obs_column='Reference'):
     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float)  # Keeps same shape as adata.X
     # Get unique categories
-    categories = adata.obs[obs_column].cat.categories
+    references = adata.obs[ref_column].cat.categories
-    for cat in categories:
+    for ref in references:
         # Select subset for this category
-        cat_mask = adata.obs[obs_column] == cat
-        cat_subset = adata[cat_mask]
+        ref_mask = adata.obs[ref_column] == ref
+        ref_subset = adata[ref_mask]
         # Extract the probability matrix
-        original_matrix = cat_subset.X.copy()
+        original_matrix = ref_subset.X.copy()
         # Extract the thresholds for each position efficiently
-        thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+        thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
         # Identify NaN values
         nan_mask = np.isnan(original_matrix)
@@ -39,7 +41,7 @@ def binarize_on_Youden(adata, obs_column='Reference'):
         binarized_matrix[nan_mask] = np.nan
         # Assign the binarized values back into the preallocated storage
-        binarized_methylation[cat_mask, :] = binarized_matrix
+        binarized_methylation[ref_subset, :] = binarized_matrix
     # Store the binarized matrix in a new layer
-    adata.layers['binarized_methylation'] = binarized_methylation
+    adata.layers[output_layer_name] = binarized_methylation

smftools/preprocessing/calculate_complexity_II.py CHANGED Viewed

@@ -11,7 +11,7 @@ def calculate_complexity_II(
     n_depths=12,
     random_state=0,
     csv_summary=True,
-    uns_flag='complexity_analysis_complete',
+    uns_flag='calculate_complexity_II_performed',
     force_redo=False,
     bypass=False
 ):

smftools/preprocessing/calculate_coverage.py CHANGED Viewed

@@ -1,4 +1,7 @@
-def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.00001, uns_flag='positional_coverage_calculated'):
+def calculate_coverage(adata,
+                       ref_column='Reference_strand',
+                       position_nan_threshold=0.01,
+                       uns_flag='calculate_coverage_performed'):
     """
     Append position-level metadata regarding whether the position is informative within the given observation category.
@@ -20,32 +23,32 @@ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_thresh
         # QC already performed; nothing to do
         return
-    categories = adata.obs[obs_column].cat.categories
+    references = adata.obs[ref_column].cat.categories
     n_categories_with_position = np.zeros(adata.shape[1])
-    # Loop over categories
-    for cat in categories:
-        print(f'Assessing positional coverage across samples for {cat} reference')
+    # Loop over references
+    for ref in references:
+        print(f'Assessing positional coverage across samples for {ref} reference')
         # Subset to current category
-        cat_mask = adata.obs[obs_column] == cat
-        temp_cat_adata = adata[cat_mask]
+        ref_mask = adata.obs[ref_column] == ref
+        temp_ref_adata = adata[ref_mask]
         # Compute fraction of valid coverage
-        cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
-        cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0]  # Avoid extra computation
+        ref_valid_coverage = np.sum(~np.isnan(temp_ref_adata.X), axis=0)
+        ref_valid_fraction = ref_valid_coverage / temp_ref_adata.shape[0]  # Avoid extra computation
         # Store coverage stats
-        adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
+        adata.var[f'{ref}_valid_fraction'] = pd.Series(ref_valid_fraction, index=adata.var.index)
         # Assign whether the position is covered based on threshold
-        adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+        adata.var[f'position_in_{ref}'] = ref_valid_fraction >= position_nan_threshold
         # Sum the number of categories covering each position
-        n_categories_with_position += adata.var[f'position_in_{cat}'].values
+        n_categories_with_position += adata.var[f'position_in_{ref}'].values
     # Store final category count
-    adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
+    adata.var[f'N_{ref_column}_with_position'] = n_categories_with_position.astype(int)
     # mark as done
     adata.uns[uns_flag] = True

smftools/preprocessing/calculate_position_Youden.py CHANGED Viewed

@@ -1,7 +1,15 @@
 ## calculate_position_Youden
 ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
-def calculate_position_Youden(adata, positive_control_sample='positive', negative_control_sample='negative', J_threshold=0.5, obs_column='Reference', infer_on_percentile=False, inference_variable='', save=False, output_directory=''):
+def calculate_position_Youden(adata,
+                              positive_control_sample=None,
+                              negative_control_sample=None,
+                              J_threshold=0.5,
+                              ref_column='Reference_strand',
+                              sample_column='Sample_names',
+                              infer_on_percentile=True,
+                              inference_variable='Raw_modification_signal',
+                              save=False,
+                              output_directory=''):
     """
     Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
@@ -26,28 +34,36 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
     from sklearn.metrics import roc_curve, roc_auc_score
     control_samples = [positive_control_sample, negative_control_sample]
-    categories = adata.obs[obs_column].cat.categories
+    references = adata.obs[ref_column].cat.categories
     # Iterate over each category in the specified obs_column
-    for cat in categories:
-        print(f"Calculating position Youden statistics for {cat}")
+    for ref in references:
+        print(f"Calculating position Youden statistics for {ref}")
         # Subset to keep only reads associated with the category
-        cat_subset = adata[adata.obs[obs_column] == cat]
+        ref_subset = adata[adata.obs[ref_column] == ref]
         # Iterate over positive and negative control samples
-        for control in control_samples:
+        for i, control in enumerate(control_samples):
             # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
-            adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
-            if infer_on_percentile:
-                sorted_column = cat_subset.obs[inference_variable].sort_values(ascending=False)
-                if control == "positive":
+            adata.uns[f'{ref}_position_methylation_dict_{control}'] = {}
+            # If controls are not passed and infer on percentile is True, infer thresholds based on top and bottom percentile windows for a given obs column metric.
+            if infer_on_percentile and not control:
+                sorted_column = ref_subset.obs[inference_variable].sort_values(ascending=False)
+                if i == 0:
+                    control == 'positive'
+                    positive_control_sample = control
                     threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
-                    control_subset = cat_subset[cat_subset.obs[inference_variable] >= threshold, :]
+                    control_subset = ref_subset[ref_subset.obs[inference_variable] >= threshold, :]
                 else:
+                    control == 'negative'
+                    negative_control_sample = control
                     threshold = np.percentile(sorted_column, infer_on_percentile)
-                    control_subset = cat_subset[cat_subset.obs[inference_variable] <= threshold, :]
+                    control_subset = ref_subset[ref_subset.obs[inference_variable] <= threshold, :]
+            elif not infer_on_percentile and not control:
+                print("Can not threshold Anndata on Youden threshold. Need to either provide control samples or set infer_on_percentile to True")
+                return
             else:
                 # get the current control subset on the given category
-                filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
-                control_subset = cat_subset[filtered_obs.index]
+                filtered_obs = ref_subset.obs[ref_subset.obs[sample_column] == control]
+                control_subset = ref_subset[filtered_obs.index]
             # Iterate through every position in the control subset
             for position in range(control_subset.shape[1]):
                 # Get the coordinate name associated with that position
@@ -63,9 +79,9 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
                 # Get fraction coverage
                 fraction_coverage = position_coverage / control_subset.shape[0]
                 # Save the position and the position methylation data for the control subset
-                adata.uns[f'{cat}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
+                adata.uns[f'{ref}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
-    for cat in categories:
+    for ref in references:
         fig, ax = plt.subplots(figsize=(6, 4))
         plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
         plt.xlabel('False Positive Rate')
@@ -76,13 +92,13 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
         n_total_positions = 0
         # Initialize a list that will hold the positional thresholds for the category
         probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
-        for i, key in enumerate(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'].keys()):
-            position = int(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][0])
-            positive_position_array = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][1]
-            fraction_coverage = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][2]
+        for i, key in enumerate(adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'].keys()):
+            position = int(adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][0])
+            positive_position_array = adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][1]
+            fraction_coverage = adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][2]
             if fraction_coverage > 0.2:
                 try:
-                    negative_position_array = adata.uns[f'{cat}_position_methylation_dict_{negative_control_sample}'][key][1]
+                    negative_position_array = adata.uns[f'{ref}_position_methylation_dict_{negative_control_sample}'][key][1]
                     # Combine the negative and positive control data
                     data = np.concatenate([negative_position_array, positive_position_array])
                     labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))
@@ -101,15 +117,15 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
                         plt.plot(fpr, tpr, label='ROC curve')
                 except:
                     probability_thresholding_list[position] = (0.8, np.nan)
-        title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
+        title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {ref}'
         plt.title(title)
-        save_name = output_directory + f'/{title}'
+        save_name = output_directory / f"{title}.png"
         if save:
             plt.savefig(save_name)
             plt.close()
         else:
             plt.show()
-        adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
+        adata.var[f'{ref}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
         J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
-        adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
+        adata.var[f'{ref}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]

smftools/preprocessing/calculate_read_modification_stats.py CHANGED Viewed

@@ -2,7 +2,7 @@ def calculate_read_modification_stats(adata,
                                       reference_column,
                                       sample_names_col,
                                       mod_target_bases,
-                                      uns_flag="read_modification_stats_calculated",
+                                      uns_flag="calculate_read_modification_stats_performed",
                                       bypass=False,
                                       force_redo=False
 ):
@@ -36,7 +36,7 @@ def calculate_read_modification_stats(adata,
     site_types = []
     if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
-        site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'any_C_site']
+        site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
     if 'A' in mod_target_bases:
         site_types += ['A_site']

smftools/preprocessing/filter_reads_on_length_quality_mapping.py CHANGED Viewed

@@ -11,7 +11,7 @@ def filter_reads_on_length_quality_mapping(
     length_ratio: Optional[Sequence[float]] = None,         # e.g. [min, max]
     read_quality: Optional[Sequence[float]] = None,         # e.g. [min, max]  (commonly min only)
     mapping_quality: Optional[Sequence[float]] = None,      # e.g. [min, max]  (commonly min only)
-    uns_flag: str = "reads_removed_failing_length_quality_mapping_qc",
+    uns_flag: str = "filter_reads_on_length_quality_mapping_performed",
     bypass: bool = False,
     force_redo: bool = True
 ) -> ad.AnnData:

smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl