smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. smftools/_settings.py +3 -2
  2. smftools/_version.py +1 -1
  3. smftools/datasets/F1_sample_sheet.csv +5 -0
  4. smftools/datasets/datasets.py +8 -7
  5. smftools/informatics/__init__.py +7 -5
  6. smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
  7. smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
  8. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  9. smftools/informatics/conversion_smf.py +79 -0
  10. smftools/informatics/direct_smf.py +89 -0
  11. smftools/informatics/fast5_to_pod5.py +8 -6
  12. smftools/informatics/helpers/__init__.py +18 -0
  13. smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
  14. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  15. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  16. smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
  17. smftools/informatics/helpers/canoncall.py +2 -0
  18. smftools/informatics/helpers/complement_base_list.py +21 -0
  19. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  20. smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
  21. smftools/informatics/helpers/count_aligned_reads.py +13 -9
  22. smftools/informatics/helpers/extract_base_identities.py +34 -20
  23. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  24. smftools/informatics/helpers/find_conversion_sites.py +11 -9
  25. smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
  26. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  27. smftools/informatics/helpers/index_fasta.py +12 -0
  28. smftools/informatics/helpers/modcall.py +3 -1
  29. smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
  30. smftools/informatics/helpers/ohe_batching.py +52 -0
  31. smftools/informatics/helpers/one_hot_encode.py +10 -8
  32. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  33. smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
  34. smftools/informatics/helpers/split_and_index_BAM.py +16 -4
  35. smftools/informatics/load_adata.py +127 -0
  36. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  37. smftools/informatics/subsample_pod5.py +69 -13
  38. smftools/preprocessing/__init__.py +6 -1
  39. smftools/preprocessing/append_C_context.py +37 -14
  40. smftools/preprocessing/calculate_complexity.py +2 -2
  41. smftools/preprocessing/calculate_consensus.py +47 -0
  42. smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
  43. smftools/preprocessing/calculate_coverage.py +2 -2
  44. smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
  45. smftools/preprocessing/calculate_read_length_stats.py +56 -2
  46. smftools/preprocessing/clean_NaN.py +2 -2
  47. smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
  48. smftools/preprocessing/filter_reads_on_length.py +4 -2
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/preprocessing/load_sample_sheet.py +24 -0
  51. smftools/preprocessing/make_dirs.py +21 -0
  52. smftools/preprocessing/mark_duplicates.py +34 -19
  53. smftools/preprocessing/recipes.py +125 -0
  54. smftools/preprocessing/remove_duplicates.py +7 -4
  55. smftools/tools/apply_HMM.py +1 -0
  56. smftools/tools/cluster.py +0 -0
  57. smftools/tools/read_HMM.py +1 -0
  58. smftools/tools/subset_adata.py +32 -0
  59. smftools/tools/train_HMM.py +43 -0
  60. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
  61. smftools-0.1.3.dist-info/RECORD +84 -0
  62. smftools/informatics/basecalls_to_adata.py +0 -42
  63. smftools/informatics/pod5_conversion.py +0 -53
  64. smftools/informatics/pod5_direct.py +0 -55
  65. smftools/informatics/pod5_to_adata.py +0 -40
  66. smftools-0.1.1.dist-info/RECORD +0 -64
  67. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  68. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -3,13 +3,17 @@
3
3
  ## Conversion SMF Specific
4
4
  # Read methylation QC
5
5
 
6
- def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
6
+ def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False):
7
7
  """
8
- Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives)
8
+ Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives).
9
9
 
10
10
  Parameters:
11
- adata (AnnData): An AnnData object
12
- obs_column (str): observation category of interest
11
+ adata (AnnData): An adata object
12
+ reference_column (str): String representing the name of the Reference column to use
13
+ sample_names_col (str): String representing the name of the sample name column to use
14
+ output_directory (str): String representing the output directory to make and write out the histograms.
15
+ show_methylation_histogram (bool): Whether to display the histograms.
16
+ save_methylation_histogram (bool): Whether to save the histograms.
13
17
 
14
18
  Returns:
15
19
  None
@@ -17,16 +21,21 @@ def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
17
21
  import numpy as np
18
22
  import anndata as ad
19
23
  import pandas as pd
24
+ import matplotlib.pyplot as plt
25
+ from .. import readwrite
26
+
27
+ references = set(adata.obs[reference_column])
28
+ sample_names = set(adata.obs[sample_names_col])
29
+
30
+ site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
20
31
 
21
- site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
22
- categories = adata.obs[obs_column].cat.categories
23
32
  for site_type in site_types:
24
33
  adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
25
34
  adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
26
35
  adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
27
36
  adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
28
- for cat in categories:
29
- cat_subset = adata[adata.obs[obs_column] == cat].copy()
37
+ for cat in references:
38
+ cat_subset = adata[adata.obs[reference_column] == cat].copy()
30
39
  for site_type in site_types:
31
40
  print(f'Iterating over {cat}_{site_type}')
32
41
  observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
@@ -42,4 +51,46 @@ def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
42
51
  adata.obs.update(temp_obs_data)
43
52
  # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
44
53
  pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
45
- adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
54
+ adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
55
+
56
+ adata.uns['methylation_dict'] = {}
57
+ n_bins = 50
58
+ site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
59
+
60
+ for reference in references:
61
+ reference_adata = adata[adata.obs[reference_column] == reference].copy()
62
+ split_reference = reference.split('_')[0][1:]
63
+ for sample in sample_names:
64
+ sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
65
+ for site_type in site_types_to_analyze:
66
+ methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
67
+ max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
68
+ if not np.isnan(max_meth):
69
+ n_bins = int(max_meth // 2)
70
+ else:
71
+ n_bins = 1
72
+ mean = np.mean(methylation_data)
73
+ median = np.median(methylation_data)
74
+ stdev = np.std(methylation_data)
75
+ adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
76
+ if show_methylation_histogram or save_methylation_histogram:
77
+ fig, ax = plt.subplots(figsize=(6, 4))
78
+ count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
79
+ plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
80
+ plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
81
+ plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
82
+ plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
83
+ plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
84
+ plt.xlabel('Fraction methylated')
85
+ plt.ylabel('Proportion')
86
+ title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
87
+ plt.title(title, pad=20)
88
+ plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
89
+ ax.spines['right'].set_visible(False)
90
+ ax.spines['top'].set_visible(False)
91
+ save_name = output_directory + f'/{readwrite.date_string()} {title}'
92
+ if save_methylation_histogram:
93
+ plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
94
+ plt.close()
95
+ else:
96
+ plt.show()
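
A minimal sketch of how the updated signature might be called after the informatics step. The import path, input file name, and directory are assumptions for illustration, and the call assumes `append_C_context` has already populated the per-reference site-type matrices in `adata.obsm`:

```python
import anndata as ad
from smftools.preprocessing import calculate_converted_read_methylation_stats  # assumed export path

adata = ad.read_h5ad("conversion_smf.h5ad")  # hypothetical pipeline output
calculate_converted_read_methylation_stats(
    adata,
    reference_column="Reference",
    sample_names_col="Sample_names",
    output_directory="qc_plots",
    show_methylation_histogram=False,
    save_methylation_histogram=True,
)
# Per-read sums/means land in adata.obs (e.g. 'GpC_site_row_methylation_means',
# 'GpC_above_other_C'), and per reference/sample/site-type [mean, median, stdev]
# summaries are stored in adata.uns['methylation_dict'].
```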
@@ -7,7 +7,7 @@ def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.0
7
7
  Parameters:
8
8
  adata (AnnData): An AnnData object
9
9
  obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
10
- position_nan_threshold (float): A minimal threshold of coverage to call the position as valid.
10
+ position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
11
11
 
12
12
  Returns:
13
13
  None
@@ -21,7 +21,7 @@ def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.0
21
21
  # Loop over categories
22
22
  for cat in categories:
23
23
  # Look at positional information for each reference
24
- temp_cat_adata = adata[adata.obs[obs_column] == cat]
24
+ temp_cat_adata = adata[adata.obs[obs_column] == cat].copy()
25
25
  # Look at read coverage on the given category strand
26
26
  cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
27
27
  cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
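
The added `.copy()` matters because AnnData subsetting returns a lazy view rather than an independent object; a small self-contained illustration (toy matrix, not package data):

```python
import numpy as np
import anndata as ad

adata = ad.AnnData(np.arange(6, dtype=float).reshape(3, 2))
view = adata[:2]           # a view, still backed by the parent object
real = adata[:2].copy()    # an independent AnnData
print(view.is_view, real.is_view)  # True False
```

Materializing the subset up front sidesteps AnnData's implicit-modification warnings if the subset is later written to.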
@@ -13,7 +13,7 @@ def calculate_pairwise_hamming_distances(arrays):
13
13
 
14
14
  """
15
15
  import numpy as np
16
- import tqdm
16
+ from tqdm import tqdm
17
17
  from scipy.spatial.distance import hamming
18
18
  num_arrays = len(arrays)
19
19
  # Initialize an empty distance matrix
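
The import fix above matters because the `tqdm` module itself is not callable, only the `tqdm` class inside it is; a quick demonstration:

```python
import tqdm

try:
    tqdm(range(3))            # the module object: raises TypeError
except TypeError as err:
    print(err)                # 'module' object is not callable

from tqdm import tqdm         # rebinds the name to the progress-bar class

for _ in tqdm(range(3)):      # now usable as a wrapper around any iterable
    pass
```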
@@ -1,12 +1,17 @@
1
1
  ## calculate_read_length_stats
2
2
 
3
3
  # Read length QC
4
- def calculate_read_length_stats(adata):
4
+ def calculate_read_length_stats(adata, reference_column, sample_names_col, output_directory, show_read_length_histogram=False, save_read_length_histogram=False):
5
5
  """
6
6
  Append first valid position in a read and last valid position in the read. From this determine and append the read length.
7
7
 
8
8
  Parameters:
9
9
  adata (AnnData): An adata object
10
+ reference_column (str): String representing the name of the Reference column to use
11
+ sample_names_col (str): String representing the name of the sample name column to use
12
+ output_directory (str): String representing the output directory to make and write out the histograms.
13
+ show_read_length_histogram (bool): Whether to display the histograms.
14
+ save_read_length_histogram (bool): Whether to save the histograms.
10
15
 
11
16
  Returns:
12
17
  upper_bound (int): last valid position in the dataset
@@ -15,8 +20,17 @@ def calculate_read_length_stats(adata):
15
20
  import numpy as np
16
21
  import anndata as ad
17
22
  import pandas as pd
18
- ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variable which hold the first and last valid positions in the entire dataset
23
+ import matplotlib.pyplot as plt
24
+ from .. import readwrite
25
+ from .make_dirs import make_dirs
26
+
27
+ make_dirs([output_directory])
19
28
 
29
+ references = set(adata.obs[reference_column])
30
+ sample_names = set(adata.obs[sample_names_col])
31
+
32
+ ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variable which hold the first and last valid positions in the entire dataset
33
+ print('calculating read length stats')
20
34
  # Add some basic observation-level (read-level) metadata to the anndata object
21
35
  read_first_valid_position = np.array([int(adata.var_names[i]) for i in np.argmax(~np.isnan(adata.X), axis=1)])
22
36
  read_last_valid_position = np.array([int(adata.var_names[i]) for i in (adata.X.shape[1] - 1 - np.argmax(~np.isnan(adata.X[:, ::-1]), axis=1))])
@@ -29,4 +43,44 @@ def calculate_read_length_stats(adata):
29
43
  # Define variables to hold the first and last valid position in the dataset
30
44
  upper_bound = int(np.nanmax(adata.obs['last_valid_position']))
31
45
  lower_bound = int(np.nanmin(adata.obs['first_valid_position']))
46
+
47
+ # Add an unstructured element to the anndata object which points to a dictionary of read lengths keyed by reference and sample name. Points to a tuple containing (mean, median, stdev) of the read lengths of the sample for the given reference strand
48
+
49
+ ## Plot histogram of read length data and save the median and stdev of the read lengths for each sample.
50
+ adata.uns['read_length_dict'] = {}
51
+
52
+ for reference in references:
53
+ temp_reference_adata = adata[adata.obs[reference_column] == reference].copy()
54
+ split_reference = reference.split('_')[0][1:]
55
+ for sample in sample_names:
56
+ temp_sample_adata = temp_reference_adata[temp_reference_adata.obs[sample_names_col] == sample].copy()
57
+ temp_data = temp_sample_adata.obs['read_length']
58
+ max_length = np.max(temp_data)
59
+ mean = np.mean(temp_data)
60
+ median = np.median(temp_data)
61
+ stdev = np.std(temp_data)
62
+ adata.uns['read_length_dict'][f'{reference}_{sample}'] = [mean, median, stdev]
63
+ if not np.isnan(max_length):
64
+ n_bins = int(max_length // 100)
65
+ else:
66
+ n_bins = 1
67
+ if show_read_length_histogram or save_read_length_histogram:
68
+ plt.figure(figsize=(10, 6))
69
+ plt.text(median + 0.5, max(plt.hist(temp_data, bins=n_bins)[0]) / 2, f'Median: {median:.2f}', color='red')
70
+ plt.hist(temp_data, bins=n_bins, alpha=0.7, color='blue', edgecolor='black')
71
+ plt.xlabel('Read Length')
72
+ plt.ylabel('Count')
73
+ title = f'Read length distribution of {temp_sample_adata.shape[0]} total reads from {sample} sample on {split_reference} allele'
74
+ plt.title(title)
75
+ # Add a vertical line at the median
76
+ plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
77
+ # Annotate the median
78
+ plt.xlim(lower_bound - 100, upper_bound + 100)
79
+ if save_read_length_histogram:
80
+ save_name = output_directory + f'/{readwrite.date_string()} {title}'
81
+ plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
82
+ plt.close()
83
+ else:
84
+ plt.show()
85
+
32
86
  return upper_bound, lower_bound
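
A hypothetical call with the new signature (the import path and output directory are assumptions); the returned bounds are the extreme valid coordinates seen anywhere in the dataset:

```python
from smftools.preprocessing import calculate_read_length_stats  # assumed export path

upper_bound, lower_bound = calculate_read_length_stats(
    adata,                                   # AnnData from the earlier steps
    reference_column="Reference",
    sample_names_col="Sample_names",
    output_directory="qc_plots",             # created via make_dirs if missing
    show_read_length_histogram=False,
    save_read_length_histogram=True,
)
print(lower_bound, upper_bound)
# Per reference/sample [mean, median, stdev] read-length summaries are stored in
# adata.uns['read_length_dict'].
```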
@@ -1,7 +1,5 @@
1
1
  ## clean_NaN
2
- from ..readwrite import adata_to_df
3
2
 
4
- # NaN handling
5
3
  def clean_NaN(adata, layer=None):
6
4
  """
7
5
  Append layers to adata that contain NaN cleaning strategies.
@@ -16,6 +14,8 @@ def clean_NaN(adata, layer=None):
16
14
  import numpy as np
17
15
  import anndata as ad
18
16
  import pandas as pd
17
+ from ..readwrite import adata_to_df
18
+
19
19
  # Fill NaN with closest SMF value
20
20
  df = adata_to_df(adata, layer=layer)
21
21
  df = df.ffill(axis=1).bfill(axis=1)
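
The surrounding context shows the "fill closest" strategy built from a row-wise forward fill followed by a backward fill; a toy pandas example of what that does to a single read:

```python
import numpy as np
import pandas as pd

row = pd.DataFrame([[np.nan, 1.0, np.nan, 0.0, np.nan]])
print(row.ffill(axis=1).bfill(axis=1).to_numpy())
# [[1. 1. 1. 0. 0.]]  interior and trailing NaNs take the closest upstream value;
# leading NaNs take the first observed value downstream
```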
@@ -11,7 +11,7 @@ def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, m
11
11
  valid_SMF_site_threshold (float): A minimum proportion of valid SMF sites that must be present in the read. Default is 0.8
12
12
  min_SMF_threshold (float): A minimum read methylation level. Default is 0.025
13
13
  Returns:
14
- None
14
+ adata (AnnData): The filtered adata object.
15
15
  """
16
16
  import numpy as np
17
17
  import anndata as ad
@@ -24,4 +24,6 @@ def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, m
24
24
  # Keep reads with SMF methylation over background methylation.
25
25
  adata = adata[adata.obs['GpC_above_other_C'] == True].copy()
26
26
  # Keep reads over a defined methylation threshold
27
- adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
27
+ adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
28
+
29
+ return adata
@@ -10,7 +10,7 @@ def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2
10
10
  min_read_length (int): The minimum read length to keep in the filtered dataset. Default is 2700.
11
11
 
12
12
  Returns:
13
- None
13
+ adata (AnnData): The filtered adata object
14
14
  Input: Adata object. a list of lower and upper bound (set to False or None if not wanted), and a minimum read length integer.
15
15
 
16
16
  """
@@ -36,4 +36,6 @@ def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2
36
36
 
37
37
  if min_read_length:
38
38
  print(f'Subsetting adata to keep reads longer than {min_read_length}')
39
- adata = adata[adata.obs['read_length'] > min_read_length].copy()
39
+ adata = adata[adata.obs['read_length'] > min_read_length].copy()
40
+
41
+ return adata
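
Because both length and methylation filters now return the filtered object instead of relying on in-place mutation, callers need to capture the return value; a sketch of the updated call pattern (import paths assumed):

```python
from smftools.preprocessing import (
    filter_reads_on_length,
    filter_converted_reads_on_methylation,
)  # assumed export paths

adata = filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2700)
adata = filter_converted_reads_on_methylation(
    adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025
)
```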
@@ -13,6 +13,7 @@ def invert_adata(adata):
13
13
  """
14
14
  import numpy as np
15
15
  import anndata as ad
16
+ print('Inverting adata')
16
17
  # Reassign var_names with new names
17
18
  old_var_names = adata.var_names.astype(int).to_numpy()
18
19
  new_var_names = np.sort(old_var_names)[::-1].astype(str)
@@ -0,0 +1,24 @@
1
+ # load_sample_sheet
2
+
3
+ def load_sample_sheet(adata, sample_sheet_path, mapping_key_column):
4
+ """
5
+ Loads a sample sheet csv and uses one of the columns to map sample information into the AnnData object.
6
+
7
+ Parameters:
8
+ adata (AnnData): The Anndata object to append sample information to.
9
+ sample_sheet_path (str):
10
+ mapping_key_column (str):
11
+
12
+ Returns:
13
+ None
14
+ """
15
+ import pandas as pd
16
+ import anndata as ad
17
+ df = pd.read_csv(sample_sheet_path)
18
+ key_column = mapping_key_column
19
+ df[key_column] = df[key_column].astype(str)
20
+ value_columns = [column for column in df.columns if column != key_column]
21
+ mapping_dict = df.set_index(key_column)[value_columns].to_dict(orient='index')
22
+ for column in value_columns:
23
+ column_map = {key: value[column] for key, value in mapping_dict.items()}
24
+ adata.obs[column] = adata.obs[key_column].map(column_map)
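
A sketch of how the new helper might be used; the CSV layout and column names are illustrative, and the key column must already exist in `adata.obs`:

```python
# sample_sheet.csv (hypothetical):
#   Sample,Sample_names,Genotype
#   barcode01,rep1,WT
#   barcode02,rep2,KO
from smftools.preprocessing import load_sample_sheet  # assumed export path

load_sample_sheet(adata, "sample_sheet.csv", mapping_key_column="Sample")
# Every non-key column (here Sample_names and Genotype) is mapped into adata.obs,
# keyed on the existing values of adata.obs['Sample'].
```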
@@ -0,0 +1,21 @@
1
+ ## make_dirs
2
+
3
+ # General
4
+ def make_dirs(directories):
5
+ """
6
+ Takes a list of file paths and makes new directories if the directory does not already exist.
7
+
8
+ Parameters:
9
+ directories (list): A list of directories to make
10
+
11
+ Returns:
12
+ None
13
+ """
14
+ import os
15
+
16
+ for directory in directories:
17
+ if not os.path.isdir(directory):
18
+ os.mkdir(directory)
19
+ print(f"Directory '{directory}' created successfully.")
20
+ else:
21
+ print(f"Directory '{directory}' already exists.")
@@ -1,6 +1,6 @@
1
1
  ## mark_duplicates
2
2
 
3
- def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
3
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', hamming_distance_thresholds={}):
4
4
  """
5
5
  Marks duplicates in the adata object.
6
6
 
@@ -9,6 +9,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
9
9
  layers (list): A list of strings representing the layers to use.
10
10
  obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
11
11
  sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
12
+ hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
12
13
 
13
14
  Returns:
14
15
  None
@@ -48,22 +49,32 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
48
49
  distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
49
50
  # Save the distance dataframe into an unstructured component of the adata object
50
51
  adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
51
- # Calculate the minimum non-self distance for every read in the reference and sample
52
- min_distance_values = min_non_diagonal(distance_matrix)
53
- min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
54
- adata.obs.update(min_distance_df)
55
- # Generate a histogram of minimum non-self distances for each read
56
- min_distance_bins = plt.hist(min_distance_values, bins=n_reads//4)
57
- # Normalize the max value in any histogram bin to 1
58
- normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
59
- # Extract the bin index of peak centers in the histogram
60
- peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
61
- first_peak_index = peak_centers[0]
62
- offset_index = first_peak_index-1
63
- # Use the distance corresponding to the first peak as the threshold distance in graph construction
64
- first_peak_distance = min_distance_bins[1][first_peak_index]
65
- offset_distance = min_distance_bins[1][offset_index]
66
- adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
52
+ if n_reads > 1:
53
+ # Calculate the minimum non-self distance for every read in the reference and sample
54
+ min_distance_values = min_non_diagonal(distance_matrix)
55
+ min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
56
+ adata.obs.update(min_distance_df)
57
+ # Generate a histogram of minimum non-self distances for each read
58
+ if n_reads > 3:
59
+ n_bins = n_reads // 4
60
+ else:
61
+ n_bins = 1
62
+ min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
63
+ if cat in hamming_distance_thresholds:
64
+ adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = hamming_distance_thresholds[cat]
65
+ else: # eventually this should be written to use known PCR duplicate controls for thresholding.
66
+ # Normalize the max value in any histogram bin to 1
67
+ normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
68
+ # Extract the bin index of peak centers in the histogram
69
+ peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
70
+ first_peak_index = peak_centers[0]
71
+ offset_index = first_peak_index-1
72
+ # Use the distance corresponding to the first peak as the threshold distance in graph construction
73
+ first_peak_distance = min_distance_bins[1][first_peak_index]
74
+ offset_distance = min_distance_bins[1][offset_index]
75
+ adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
76
+ else:
77
+ adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
67
78
 
68
79
  ## Detect likely duplicate reads and mark them in the adata object.
69
80
  adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
@@ -91,7 +102,11 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
91
102
  clusters = [list(cluster) for cluster in clusters]
92
103
  # Get the number of clusters
93
104
  cluster_count = len(clusters)
94
- adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, cluster_count / n_reads, clusters]
105
+ if n_reads > 0:
106
+ fraction_unique = cluster_count / n_reads
107
+ else:
108
+ fraction_unique = 0
109
+ adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, fraction_unique, clusters]
95
110
  # Update the adata object
96
111
  read_cluster_map = {}
97
112
  read_duplicate_map = {}
@@ -116,4 +131,4 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
116
131
  adata.obs.update(df_combined)
117
132
  adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
118
133
  adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
119
- print(f'Hamming clusters for {sample} on {cat}\nThreshold: {first_peak_distance}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {cluster_count / n_reads}')
134
+ print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {fraction_unique}')
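
A hypothetical call using the new `hamming_distance_thresholds` override; the layer names, category name, and threshold value are illustrative, and any reference not listed falls back to the histogram-peak heuristic shown above:

```python
from smftools.preprocessing import mark_duplicates  # assumed export path

mark_duplicates(
    adata,
    layers=["GpC_site_binary", "CpG_site_binary"],          # illustrative binary layers
    obs_column="Reference",
    sample_col="Sample_names",
    hamming_distance_thresholds={"chr1_reference": 0.05},   # pinned threshold for one reference
)
print(adata.obs[["Marked_duplicate", "Unique_in_final_read_set"]].value_counts())
```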
@@ -0,0 +1,125 @@
1
+ # recipes
2
+
3
+ def recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory, mapping_key_column='Sample', reference_column = 'Reference', sample_names_col='Sample_names', invert=False):
4
+ """
5
+ The first part of the preprocessing workflow applied to the smf.inform.pod_to_adata() output derived from Kissiov_and_McKenna_2025.
6
+
7
+ Performs the following tasks:
8
+ 1) Loads a sample CSV to append metadata mappings to the adata object.
9
+ 2) Appends a boolean indicating whether each position in var_names is within a given reference.
10
+ 3) Appends the cytosine context to each position from each reference.
11
+ 4) Calculate read level methylation statistics.
12
+ 5) Optionally inverts the adata to flip the position coordinate orientation.
13
+ 6) Calculates read length statistics (start position, end position, read length)
14
+ 7) Returns a dictionary to pass the variable namespace to the parent scope.
15
+
16
+ Parameters:
17
+ adata (AnnData): The AnnData object to use as input.
18
+ sample_sheet_path (str): String representing the path to the sample sheet csv containing the sample metadata.
19
+ output_directory (str): String representing the path to the output directory for plots.
20
+ mapping_key_column (str): The column name to use as the mapping keys for applying the sample sheet metadata.
21
+ reference_column (str): The name of the reference column to use.
22
+ sample_names_col (str): The name of the sample name column to use.
23
+ invert (bool): Whether to invert the positional coordinates of the adata object.
24
+
25
+ Returns:
26
+ variables (dict): A dictionary of variables to append to the parent scope.
27
+ """
28
+ import anndata as ad
29
+ import pandas as pd
30
+ import numpy as np
31
+ from .load_sample_sheet import load_sample_sheet
32
+ from .calculate_coverage import calculate_coverage
33
+ from .append_C_context import append_C_context
34
+ from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
35
+ from .invert_adata import invert_adata
36
+ from .calculate_read_length_stats import calculate_read_length_stats
37
+
38
+ # Clean up some of the Reference metadata and save variable names that point to sets of values in the column.
39
+ adata.obs[reference_column] = adata.obs[reference_column].astype('category')
40
+ references = adata.obs[reference_column].cat.categories
41
+ split_references = [(reference, reference.split('_')[0][1:]) for reference in references]
42
+ reference_mapping = {k: v for k, v in split_references}
43
+ adata.obs[f'{reference_column}_short'] = adata.obs[reference_column].map(reference_mapping)
44
+ short_references = set(adata.obs[f'{reference_column}_short'])
45
+ binary_layers = adata.layers.keys()
46
+
47
+ # load sample sheet metadata
48
+ load_sample_sheet(adata, sample_sheet_path, mapping_key_column)
49
+
50
+ # hold sample names set
51
+ adata.obs[sample_names_col] = adata.obs[sample_names_col].astype('category')
52
+ sample_names = adata.obs[sample_names_col].cat.categories
53
+
54
+ # Add position level metadata
55
+ calculate_coverage(adata, obs_column=reference_column)
56
+ adata.var['SNP_position'] = (adata.var[f'N_{reference_column}_with_position'] > 0) & (adata.var[f'N_{reference_column}_with_position'] < len(references)).astype(bool)
57
+
58
+ # Append cytosine context to the reference positions based on the conversion strand.
59
+ append_C_context(adata, obs_column=reference_column, use_consensus=False)
60
+
61
+ # Calculate read level methylation statistics. Assess if GpC methylation level is above other_C methylation level as a QC.
62
+ calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False)
63
+
64
+ # Invert the adata object (ie flip the strand orientation for visualization)
65
+ if invert:
66
+ invert_adata(adata)
67
+ else:
68
+ pass
69
+
70
+ # Calculate read length statistics, with options to display or save the read length histograms
71
+ upper_bound, lower_bound = calculate_read_length_stats(adata, reference_column, sample_names_col, output_directory, show_read_length_histogram=False, save_read_length_histogram=False)
72
+
73
+ variables = {
74
+ "short_references": short_references,
75
+ "binary_layers": binary_layers,
76
+ "sample_names": sample_names,
77
+ "upper_bound": upper_bound,
78
+ "lower_bound": lower_bound,
79
+ "references": references
80
+ }
81
+ return variables
82
+
83
+ def recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers, hamming_distance_thresholds={}, reference_column = 'Reference', sample_names_col='Sample_names'):
84
+ """
85
+ The second part of the preprocessing workflow applied to the adata that has already been preprocessed by recipe_1_Kissiov_and_McKenna_2025.
86
+
87
+ Performs the following tasks:
88
+ 1) Adds new layers containing NaN replaced variants of adata.X (fill_closest, nan0_0minus1, nan1_12).
89
+ 2) Marks putative PCR duplicates using pairwise hamming distance metrics.
90
+ 3) Performs a complexity analysis of the library based on the PCR duplicate detection rate.
91
+ 4) Removes PCR duplicates from the adata.
92
+ 5) Returns two adata object: one for the filtered adata and one for the duplicate adata.
93
+
94
+ Parameters:
95
+ adata (AnnData): The AnnData object to use as input.
96
+ output_directory (str): String representing the path to the output directory for plots.
97
+ binary_layers (list): A list of layers to used for the binary encoding of read sequences. Used for duplicate detection.
98
+ hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
99
+ reference_column (str): The name of the reference column to use.
100
+ sample_names_col (str): The name of the sample name column to use.
101
+
102
+ Returns:
103
+ filtered_adata (AnnData): An AnnData object containing the filtered reads
104
+ duplicates (AnnData): An AnnData object containing the duplicate reads
105
+ """
106
+ import anndata as ad
107
+ import pandas as pd
108
+ import numpy as np
109
+ from .clean_NaN import clean_NaN
110
+ from .mark_duplicates import mark_duplicates
111
+ from .calculate_complexity import calculate_complexity
112
+ from .remove_duplicates import remove_duplicates
113
+
114
+ # NaN replacement strategies stored in additional layers. Having layer=None uses adata.X
115
+ clean_NaN(adata, layer=None)
116
+
117
+ # Duplicate detection using pairwise hamming distance across reads
118
+ mark_duplicates(adata, binary_layers, obs_column=reference_column, sample_col=sample_names_col, hamming_distance_thresholds=hamming_distance_thresholds)
119
+
120
+ # Complexity analysis using the marked duplicates and the lander-watermann algorithm
121
+ calculate_complexity(adata, output_directory, obs_column=reference_column, sample_col=sample_names_col, plot=True, save_plot=False)
122
+
123
+ # Remove duplicate reads and store the duplicate reads in a new AnnData object named duplicates.
124
+ filtered_adata, duplicates = remove_duplicates(adata)
125
+ return filtered_adata, duplicates
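
Taken together, a hypothetical end-to-end use of the two recipes; the file paths, output directory, and load step are assumptions for illustration:

```python
import anndata as ad
from smftools.preprocessing import (
    recipe_1_Kissiov_and_McKenna_2025,
    recipe_2_Kissiov_and_McKenna_2025,
)  # assumed export paths

adata = ad.read_h5ad("conversion_smf.h5ad")          # hypothetical informatics output
variables = recipe_1_Kissiov_and_McKenna_2025(
    adata,
    sample_sheet_path="F1_sample_sheet.csv",
    output_directory="qc_plots",
    invert=False,
)
filtered_adata, duplicates = recipe_2_Kissiov_and_McKenna_2025(
    adata,
    output_directory="qc_plots",
    binary_layers=variables["binary_layers"],
    hamming_distance_thresholds={},                  # use the histogram heuristic everywhere
)
```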
@@ -8,11 +8,14 @@ def remove_duplicates(adata):
8
8
  adata (Anndata): An adata object.
9
9
 
10
10
  Returns:
11
- None
11
+ filtered_adata (AnnData): An AnnData object of the filtered reads
12
+ duplicates (AnnData): An AnnData object of the duplicate reads
12
13
  """
13
14
  import anndata as ad
14
15
 
15
16
  initial_size = adata.shape[0]
16
- adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
17
- final_size = adata.shape[0]
18
- print(f'Removed {initial_size-final_size} reads from the dataset')
17
+ filtered_adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
18
+ final_size = filtered_adata.shape[0]
19
+ print(f'Removed {initial_size-final_size} reads from the dataset')
20
+ duplicates = adata[adata.obs['Unique_in_final_read_set'] == False].copy()
21
+ return filtered_adata, duplicates
@@ -0,0 +1 @@
1
+ # apply_HMM
smftools/tools/cluster.py: File without changes
@@ -0,0 +1 @@
1
+ # read_HMM
@@ -0,0 +1,32 @@
1
+ # subset_adata
2
+
3
+ def subset_adata(adata, obs_columns):
4
+ """
5
+ Subsets an AnnData object based on categorical values in specified `.obs` columns.
6
+
7
+ Parameters:
8
+ adata (AnnData): The AnnData object to subset.
9
+ obs_columns (list of str): List of `.obs` column names to subset by. The order matters.
10
+
11
+ Returns:
12
+ dict: A dictionary where keys are tuples of category values and values are corresponding AnnData subsets.
13
+ """
14
+
15
+ def subset_recursive(adata_subset, columns):
16
+ if not columns:
17
+ return {(): adata_subset}
18
+
19
+ current_column = columns[0]
20
+ categories = adata_subset.obs[current_column].cat.categories
21
+
22
+ subsets = {}
23
+ for cat in categories:
24
+ subset = adata_subset[adata_subset.obs[current_column] == cat]
25
+ subsets.update(subset_recursive(subset, columns[1:]))
26
+
27
+ return subsets
28
+
29
+ # Start the recursive subset process
30
+ subsets_dict = subset_recursive(adata, obs_columns)
31
+
32
+ return subsets_dict
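
For orientation, an equivalent way to build such a per-category dictionary non-recursively; this is only an illustration of the idea, not the package's implementation:

```python
from itertools import product
import numpy as np

def subset_by_categories(adata, obs_columns):
    """Map tuples of category values to the corresponding AnnData subsets."""
    subsets = {}
    category_lists = [adata.obs[col].cat.categories for col in obs_columns]
    for combo in product(*category_lists):
        mask = np.ones(adata.n_obs, dtype=bool)
        for col, value in zip(obs_columns, combo):
            mask &= (adata.obs[col] == value).to_numpy()
        if mask.any():
            subsets[combo] = adata[mask]
    return subsets

# e.g. subset_by_categories(adata, ["Reference", "Sample_names"])
```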
@@ -0,0 +1,43 @@
1
+ # train_HMM
2
+
3
+ def train_HMM(adata, model_name='trained_HMM', save_hmm=False):
4
+ """
5
+
6
+ Parameters:
7
+ adata (AnnData): Input AnnData object
8
+ model_name (str): Name of the model
9
+ save_hmm (bool): Whether to save the model
10
+
11
+ """
12
+ import numpy as np
13
+ import anndata as ad
14
+ from pomegranate.distributions import Categorical
15
+ from pomegranate.hmm import DenseHMM
16
+
17
+ bound = Categorical([[0.95, 0.05]])
18
+ unbound = Categorical([[0.05, 0.95]])
19
+
20
+ edges = [[0.9, 0.1], [0.1, 0.9]]
21
+ starts = [0.5, 0.5]
22
+ ends = [0.5, 0.5]
23
+
24
+ model = DenseHMM([bound, unbound], edges=edges, starts=starts, ends=ends, max_iter=5, verbose=True)
25
+
26
+ # define training sets and labels
27
+ # Determine the number of reads to sample
28
+ n_sample = round(0.7 * adata.X.shape[0])
29
+ # Generate random indices
30
+ np.random.seed(0)
31
+ random_indices = np.random.choice(adata.shape[0], size=n_sample, replace=False)
32
+ # Subset the AnnData object using the random indices
33
+ training_adata_subsampled = adata[random_indices, :]
34
+ training_sequences = training_adata_subsampled.X
35
+
36
+ # Train the HMM without labeled data
37
+ model.fit(training_sequences, algorithm='baum-welch')
38
+
39
+ if save_hmm:
40
+ # Save the model to a file
41
+ model_json = model.to_json()
42
+ with open(f'{model_name}.json', 'w') as f:
43
+ f.write(model_json)