smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py}
@@ -1,6 +1,6 @@
  ## mark_duplicates
 
- def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', hamming_distance_thresholds={}):
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
  """
  Marks duplicates in the adata object.
 
@@ -8,8 +8,9 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
  adata (AnnData): An adata object.
  layers (list): A list of strings representing the layers to use.
  obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
- sample_col (str):L A string representing the obs column name to second subset on. Default is 'Sample_names'.
- hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
+ sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
+ method (str): method to use for calculating the distance metric
+ distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
 
  Returns:
  None
@@ -21,7 +22,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
  from scipy.signal import find_peaks
  import networkx as nx
  from .binary_layers_to_ohe import binary_layers_to_ohe
- from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
+ from .calculate_pairwise_differences import calculate_pairwise_differences
  from .min_non_diagonal import min_non_diagonal
 
  categories = adata.obs[obs_column].cat.categories
@@ -29,49 +30,59 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
 
  # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
  adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+ cat_sample_dict = {}
  for cat in categories:
  cat_subset = adata[adata.obs[obs_column] == cat].copy()
  for sample in sample_names:
  sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+ sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site'] == True].copy() # only uses C sites from the converted strand
  # Encode sequencing reads as a one-hot-encodings
- adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+ print(f'One-hot encoding reads from {sample} on {cat}')
+ cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
  # Unpack the read names and one hot encodings into lists
  read_names = []
  ohe_list = []
- for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
+ for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
  read_names.append(read_name)
  ohe_list.append(ohe)
  # Calculate the pairwise hamming distances
- print(f'Calculating hamming distances for {sample} on {cat} allele')
- distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
+ if method == 'N_masked_distances':
+ print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+ distance_matrix = calculate_pairwise_differences(ohe_list)
+ else:
+ print(f'{method} for calculating differences is not available')
  n_reads = distance_matrix.shape[0]
  # Load the hamming matrix into a dataframe with index and column names as the read_names
  distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
- # Save the distance dataframe into an unstructured component of the adata object
- adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+ cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
  if n_reads > 1:
  # Calculate the minimum non-self distance for every read in the reference and sample
  min_distance_values = min_non_diagonal(distance_matrix)
  min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
  adata.obs.update(min_distance_df)
- # Generate a histogram of minimum non-self distances for each read
- if n_reads > 3:
- n_bins = n_reads // 4
- else:
- n_bins = 1
- min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
- if cat in hamming_distance_thresholds:
- adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = hamming_distance_thresholds[cat]
+
+ if cat in distance_thresholds:
+ adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
  else: # eventually this should be written to use known PCR duplicate controls for thresholding.
+ # Generate a histogram of minimum non-self distances for each read
+ if n_reads > 3:
+ n_bins = n_reads // 4
+ else:
+ n_bins = 1
+ min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
  # Normalize the max value in any histogram bin to 1
  normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
  # Extract the bin index of peak centers in the histogram
- peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
- first_peak_index = peak_centers[0]
- offset_index = first_peak_index-1
- # Use the distance corresponding to the first peak as the threshold distance in graph construction
- first_peak_distance = min_distance_bins[1][first_peak_index]
- offset_distance = min_distance_bins[1][offset_index]
+ try:
+ peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+ first_peak_index = peak_centers[0]
+ offset_index = first_peak_index-1
+ # Use the distance corresponding to the first peak as the threshold distance in graph construction
+ first_peak_distance = min_distance_bins[1][first_peak_index]
+ offset_distance = min_distance_bins[1][offset_index]
+ except:
+ offset_distance = normalized_min_distance_counts[0]
  adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
  else:
  adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
@@ -83,7 +94,7 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
 
  for cat in categories:
  for sample in sample_names:
- distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+ distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
  read_names = distance_df.index
  distance_matrix = distance_df.values
  n_reads = distance_matrix.shape[0]
@@ -106,7 +117,8 @@ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_na
  fraction_unique = cluster_count / n_reads
  else:
  fraction_unique = 0
- adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, fraction_unique, clusters]
+ adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+ adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
  # Update the adata object
  read_cluster_map = {}
  read_duplicate_map = {}
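For orientation, a minimal usage sketch of the updated mark_duplicates signature (the input file, layer names, and threshold value below are illustrative assumptions, not taken from the package):

import anndata as ad

adata = ad.read_h5ad("experiment.h5ad")  # hypothetical input
binary_layers = ["A_binary_encoding", "C_binary_encoding", "G_binary_encoding",
                 "T_binary_encoding", "N_binary_encoding"]  # assumed layer names
mark_duplicates(
    adata,
    layers=binary_layers,
    obs_column="Reference",
    sample_col="Sample_names",
    method="N_masked_distances",                 # only method handled in the diff above
    distance_thresholds={"my_amplicon": 0.05},   # hypothetical per-reference threshold
)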
smftools/preprocessing/binarize_on_Youden.py
@@ -1,42 +1,45 @@
- ## binarize_on_Youden
-
  def binarize_on_Youden(adata, obs_column='Reference'):
  """
- Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
+ Binarize SMF values based on position thresholds determined by calculate_position_Youden.
 
  Parameters:
- adata (AnnData): The anndata object to binarize. pp.calculate_position_Youden function has to be run first.
- obs_column (str): The obs_column to stratify on. Needs to be the same as passed in pp.calculate_position_Youden.
- Input: adata object that has had calculate_position_Youden called on it.
- Output:
+ adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+ obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+ Modifies:
+ Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
  """
  import numpy as np
- import anndata as ad
- temp_adata = None
- categories = adata.obs[obs_column].cat.categories
+ import anndata as ad
+
+ # Initialize an empty matrix to store the binarized methylation values
+ binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X
+
+ # Get unique categories
+ categories = adata.obs[obs_column].cat.categories
+
  for cat in categories:
- # Get the category subset
- cat_subset = adata[adata.obs[obs_column] == cat].copy()
- # extract the probability matrix for the category subset
- original_matrix = cat_subset.X
- # extract the learned methylation call thresholds for each position in the category.
- thresholds = [cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'][i][0] for i in range(cat_subset.shape[1])]
- # In the original matrix, get all positions that are nan values
+ # Select subset for this category
+ cat_mask = adata.obs[obs_column] == cat
+ cat_subset = adata[cat_mask]
+
+ # Extract the probability matrix
+ original_matrix = cat_subset.X.copy()
+
+ # Extract the thresholds for each position efficiently
+ thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+
+ # Identify NaN values
  nan_mask = np.isnan(original_matrix)
- # Binarize the matrix on the new thresholds
+
+ # Binarize based on threshold
  binarized_matrix = (original_matrix > thresholds).astype(float)
- # At the original positions that had nan values, replace the values with nans again
+
+ # Restore NaN values
  binarized_matrix[nan_mask] = np.nan
- # Make a new layer for the reference that contains the binarized methylation calls
- cat_subset.layers['binarized_methylation'] = binarized_matrix
- if temp_adata:
- # If temp_data already exists, concatenate
- temp_adata = ad.concat([temp_adata, cat_subset], join='outer', index_unique=None).copy()
- else:
- # If temp_adata is still None, initialize temp_adata with reference_subset
- temp_adata = cat_subset.copy()
-
- # Sort the temp adata on the index names of the primary adata
- temp_adata = temp_adata[adata.obs_names].copy()
- # Pull back the new binarized layers into the original adata object
- adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
+
+ # Assign the binarized values back into the preallocated storage
+ binarized_methylation[cat_mask, :] = binarized_matrix
+
+ # Store the binarized matrix in a new layer
+ adata.layers['binarized_methylation'] = binarized_methylation
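A minimal sketch of how binarize_on_Youden is expected to be used together with calculate_position_Youden (the control sample names here are assumptions):

calculate_position_Youden(adata, positive_control_sample="plus_MTase",
                          negative_control_sample="minus_MTase",
                          J_threshold=0.5, obs_column="Reference")
binarize_on_Youden(adata, obs_column="Reference")
binary_calls = adata.layers["binarized_methylation"]  # NaN wherever the input probability was NaN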
smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,11 +1,11 @@
  ## binary_layers_to_ohe
 
  ## Conversion SMF Specific
- def binary_layers_to_ohe(adata, layers, stack='hstack'):
+ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
  """
  Parameters:
  adata (AnnData): Anndata object.
- layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix
+ binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
  stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
 
  Returns:
@@ -14,8 +14,18 @@ def binary_layers_to_ohe(adata, layers, stack='hstack'):
  """
  import numpy as np
  import anndata as ad
+
+ # Ensure that the N layer is last!
+ # Grab all binary layers that are not encoding N
+ ACGT_binary_layers = [layer for layer in binary_layers if 'binary' in layer and layer != 'N_binary_encoding']
+ # If there is a binary layer encoding N, hold it in N_binary_layer
+ N_binary_layer = [layer for layer in binary_layers if layer == 'N_binary_encoding']
+ # Add the N_binary_encoding layer to the end of the list of binary layers
+ all_binary_layers = ACGT_binary_layers + N_binary_layer
+ print(f'Found {all_binary_layers} layers in adata')
+
  # Extract the layers
- layers = [adata.layers[layer_name] for layer_name in layers]
+ layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
  n_reads = layers[0].shape[0]
  ohe_dict = {}
  for i in range(n_reads):
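The layer-ordering rule added above can be checked in isolation; a small sketch with assumed layer names:

binary_layers = ["N_binary_encoding", "A_binary_encoding", "C_binary_encoding"]
ACGT = [l for l in binary_layers if "binary" in l and l != "N_binary_encoding"]
N = [l for l in binary_layers if l == "N_binary_encoding"]
print(ACGT + N)  # ['A_binary_encoding', 'C_binary_encoding', 'N_binary_encoding']; N is always last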
smftools/preprocessing/calculate_complexity.py
@@ -32,7 +32,8 @@ def calculate_complexity(adata, output_directory='', obs_column='Reference', sam
 
  for cat in categories:
  for sample in sample_names:
- unique_reads, total_reads = adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'][0:2]
+ unique_reads = adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}']
+ total_reads = adata.uns[f'total_reads_within_{cat}_{sample}']
  reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
  # Subsampling depths
  subsampling_depths = [total_reads // (i+1) for i in range(10)]
@@ -49,7 +50,7 @@
  # Generate data for the complexity curve
  x_data = np.linspace(0, 5000, 100)
  y_data = lander_waterman(x_data, *popt)
- adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
+ adata.uns[f'Library_complexity_of_{sample}_on_{cat}'] = popt[0]
  if plot:
  import matplotlib.pyplot as plt
  # Plot the complexity curve
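The hunk above fits a Lander-Waterman curve and stores popt[0] as the library complexity. The exact functional form used by smftools is defined elsewhere in the module; a common parametrization, shown here only for orientation, is:

import numpy as np

def lander_waterman(x, c0):
    # Expected number of distinct molecules observed after drawing x reads
    # from a library containing c0 unique molecules (assumed form, not copied from smftools).
    return c0 * (1 - np.exp(-x / c0))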
smftools/preprocessing/calculate_converted_read_methylation_stats.py
@@ -3,7 +3,7 @@
  ## Conversion SMF Specific
  # Read methylation QC
 
- def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False):
+ def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col):
  """
  Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives).
 
@@ -11,9 +11,6 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
  adata (AnnData): An adata object
  reference_column (str): String representing the name of the Reference column to use
  sample_names_col (str): String representing the name of the sample name column to use
- output_directory (str): String representing the output directory to make and write out the histograms.
- show_methylation_histogram (bool): Whether to display the histograms.
- save_methylation_histogram (bool): Whether to save the histograms.
 
  Returns:
  None
@@ -21,8 +18,8 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
  import numpy as np
  import anndata as ad
  import pandas as pd
- import matplotlib.pyplot as plt
- from .. import readwrite
+
+ print('Calculating read level methylation statistics')
 
  references = set(adata.obs[reference_column])
  sample_names = set(adata.obs[sample_names_col])
@@ -53,44 +50,45 @@ def calculate_converted_read_methylation_stats(adata, reference_column, sample_n
  pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
  adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
 
- adata.uns['methylation_dict'] = {}
- n_bins = 50
- site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+ # Below should be a plotting function
+ # adata.uns['methylation_dict'] = {}
+ # n_bins = 50
+ # site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
 
- for reference in references:
- reference_adata = adata[adata.obs[reference_column] == reference].copy()
- split_reference = reference.split('_')[0][1:]
- for sample in sample_names:
- sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
- for site_type in site_types_to_analyze:
- methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
- max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
- if not np.isnan(max_meth):
- n_bins = int(max_meth // 2)
- else:
- n_bins = 1
- mean = np.mean(methylation_data)
- median = np.median(methylation_data)
- stdev = np.std(methylation_data)
- adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
- if show_methylation_histogram or save_methylation_histogram:
- fig, ax = plt.subplots(figsize=(6, 4))
- count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
- plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
- plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
- plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
- plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
- plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
- plt.xlabel('Fraction methylated')
- plt.ylabel('Proportion')
- title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
- plt.title(title, pad=20)
- plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
- ax.spines['right'].set_visible(False)
- ax.spines['top'].set_visible(False)
- save_name = output_directory + f'/{readwrite.date_string()} {title}'
- if save_methylation_histogram:
- plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
- plt.close()
- else:
- plt.show()
+ # for reference in references:
+ # reference_adata = adata[adata.obs[reference_column] == reference].copy()
+ # split_reference = reference.split('_')[0][1:]
+ # for sample in sample_names:
+ # sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
+ # for site_type in site_types_to_analyze:
+ # methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
+ # max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
+ # if not np.isnan(max_meth):
+ # n_bins = int(max_meth // 2)
+ # else:
+ # n_bins = 1
+ # mean = np.mean(methylation_data)
+ # median = np.median(methylation_data)
+ # stdev = np.std(methylation_data)
+ # adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
+ # if show_methylation_histogram or save_methylation_histogram:
+ # fig, ax = plt.subplots(figsize=(6, 4))
+ # count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
+ # plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+ # plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
+ # plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
+ # plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
+ # plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
+ # plt.xlabel('Fraction methylated')
+ # plt.ylabel('Proportion')
+ # title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
+ # plt.title(title, pad=20)
+ # plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
+ # ax.spines['right'].set_visible(False)
+ # ax.spines['top'].set_visible(False)
+ # save_name = output_directory + f'/{readwrite.date_string()} {title}'
+ # if save_methylation_histogram:
+ # plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+ # plt.close()
+ # else:
+ # plt.show()
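With the plotting code commented out, the slimmed-down function only annotates adata.obs; a minimal usage sketch (column names assumed to match the defaults used elsewhere in the package):

calculate_converted_read_methylation_stats(adata,
                                            reference_column="Reference",
                                            sample_names_col="Sample_names")
print(adata.obs["GpC_above_other_C"].value_counts())  # reads whose GpC methylation exceeds the other-C background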
smftools/preprocessing/calculate_coverage.py
@@ -1,41 +1,42 @@
- ## calculate_coverage
-
- def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
+ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.05):
  """
- Append position level metadata regarding whether the position is informative within the given observation category.
+ Append position-level metadata regarding whether the position is informative within the given observation category.
 
  Parameters:
  adata (AnnData): An AnnData object
  obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
  position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
 
- Returns:
- None
+ Modifies:
+ - Adds new columns to `adata.var` containing coverage statistics.
  """
  import numpy as np
- import anndata as ad
  import pandas as pd
-
+ import anndata as ad
+
  categories = adata.obs[obs_column].cat.categories
  n_categories_with_position = np.zeros(adata.shape[1])
+
  # Loop over categories
  for cat in categories:
- # Look at positional information for each reference
- temp_cat_adata = adata[adata.obs[obs_column] == cat].copy()
- # Look at read coverage on the given category strand
+ print(f'Assessing positional coverage across samples for {cat} reference')
+
+ # Subset to current category
+ cat_mask = adata.obs[obs_column] == cat
+ temp_cat_adata = adata[cat_mask]
+
+ # Compute fraction of valid coverage
  cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
- cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
- cat_valid_fraction = cat_valid_coverage / (cat_valid_coverage + cat_invalid_coverage)
- # Append metadata for category to the anndata object
+ cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0] # Avoid extra computation
+
+ # Store coverage stats
  adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
- # Characterize if the position is in the given category or not
- conditions = [
- (adata.var[f'{cat}_valid_fraction'] >= position_nan_threshold),
- (adata.var[f'{cat}_valid_fraction'] < position_nan_threshold)
- ]
- choices = [True, False]
- adata.var[f'position_in_{cat}'] = np.select(conditions, choices, default=False)
- n_categories_with_position += np.array(adata.var[f'position_in_{cat}'])
-
- # Final array with the sum at each position of the number of categories covering that position
- adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
+
+ # Assign whether the position is covered based on threshold
+ adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+
+ # Sum the number of categories covering each position
+ n_categories_with_position += adata.var[f'position_in_{cat}'].values
+
+ # Store final category count
+ adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
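A minimal usage sketch of the revised calculate_coverage (note the default obs column is now 'Reference_strand'; the inspected columns follow the names created in the diff):

calculate_coverage(adata, obs_column="Reference_strand", position_nan_threshold=0.05)
print(adata.var.filter(like="_valid_fraction").head())       # per-category fraction of non-NaN reads
print(adata.var["N_Reference_strand_with_position"].head())  # number of categories covering each position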
smftools/preprocessing/calculate_pairwise_differences.py (new file)
@@ -0,0 +1,49 @@
+ # calculate_pairwise_differences
+
+ def calculate_pairwise_differences(arrays):
+ """
+ Calculate the pairwise differences for a list of h-stacked ndarrays. Ignore N-positions
+
+ Parameters:
+ arrays (str): A list of ndarrays.
+
+ Returns:
+ distance_matrix (ndarray): a 2D array containing the pairwise differences between all arrays.
+ """
+ import numpy as np
+ from tqdm import tqdm
+
+ num_arrays = len(arrays)
+
+ n_rows = 5
+ reshaped_arrays = [array.reshape(n_rows, -1) for array in arrays]
+ N_masks = [array[-1].astype(bool) for array in reshaped_arrays]
+ reshaped_arrays_minus_N = [array[:-1].flatten() for array in reshaped_arrays]
+
+ # Precompute the repeated N masks to avoid repeated computations
+ repeated_N_masks = [np.tile(N_mask, (n_rows - 1)) for N_mask in N_masks]
+
+ # Initialize the distance matrix
+ distance_matrix = np.zeros((num_arrays, num_arrays), dtype=np.float32)
+
+ # Calculate pairwise distances with progress bar
+ for i in tqdm(range(num_arrays), desc="Calculating Pairwise Differences"):
+ array_i = reshaped_arrays_minus_N[i]
+ N_mask_i = repeated_N_masks[i]
+
+ for j in range(i + 1, num_arrays):
+ array_j = reshaped_arrays_minus_N[j]
+ N_mask_j = repeated_N_masks[j]
+
+ # Combined mask to ignore N positions
+ combined_mask = N_mask_i | N_mask_j
+
+ # Calculate the hamming distance directly with boolean operations
+ differences = (array_i != array_j) & ~combined_mask
+ distance = np.sum(differences) / np.sum(~combined_mask)
+
+ # Store the symmetric distances
+ distance_matrix[i, j] = distance
+ distance_matrix[j, i] = distance
+
+ return distance_matrix
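A tiny worked example of the N-masked distance on two hypothetical one-hot reads (five rows per read, matching the n_rows = 5 layout above; the last row is the N mask):

import numpy as np

a = np.array([[1, 0, 0, 0],   # A layer
              [0, 1, 0, 0],   # C layer
              [0, 0, 1, 0],   # G layer
              [0, 0, 0, 0],   # T layer
              [0, 0, 0, 1]])  # N mask: position 4 is unknown
b = np.array([[1, 0, 0, 0],
              [0, 0, 0, 0],
              [0, 1, 1, 0],
              [0, 0, 0, 0],
              [0, 0, 0, 1]])
dist = calculate_pairwise_differences([a.flatten(), b.flatten()])
print(dist[0, 1])  # 2 differing entries / 12 unmasked entries, roughly 0.17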
smftools/preprocessing/calculate_position_Youden.py
@@ -1,7 +1,7 @@
  ## calculate_position_Youden
 
  ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
- def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False, output_directory=''):
+ def calculate_position_Youden(adata, positive_control_sample='positive', negative_control_sample='negative', J_threshold=0.5, obs_column='Reference', infer_on_percentile=False, inference_variable='', save=False, output_directory=''):
  """
  Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
 
@@ -11,6 +11,8 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
  negative_control_sample (str): string representing the sample name corresponding to the Minus MTase control sample.
  J_threshold (float): A float indicating the J-statistic used to indicate whether a position passes QC for methylation calls.
  obs_column (str): The category to iterate over.
+ infer_on_perdentile (bool | int): If False, use defined postive and negative control samples. If an int (0 < int < 100) is passed, this uses the top and bottom int percentile of methylated reads based on metric in inference_variable column.
+ inference_variable (str): If infer_on_percentile has an integer value passed, use the AnnData observation column name passed by this string as the metric.
  save (bool): Whether to save the ROC plots.
  output_directory (str): String representing the path to the output directory to output the ROC curves.
 
@@ -27,15 +29,25 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
  categories = adata.obs[obs_column].cat.categories
  # Iterate over each category in the specified obs_column
  for cat in categories:
+ print(f"Calculating position Youden statistics for {cat}")
  # Subset to keep only reads associated with the category
- cat_subset = adata[adata.obs[obs_column] == cat].copy()
+ cat_subset = adata[adata.obs[obs_column] == cat]
  # Iterate over positive and negative control samples
  for control in control_samples:
  # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
  adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
- # get the current control subset on the given category
- filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
- control_subset = cat_subset[filtered_obs.index].copy()
+ if infer_on_percentile:
+ sorted_column = cat_subset.obs[inference_variable].sort_values(ascending=False)
+ if control == "positive":
+ threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
+ control_subset = cat_subset[cat_subset.obs[inference_variable] >= threshold, :]
+ else:
+ threshold = np.percentile(sorted_column, infer_on_percentile)
+ control_subset = cat_subset[cat_subset.obs[inference_variable] <= threshold, :]
+ else:
+ # get the current control subset on the given category
+ filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
+ control_subset = cat_subset[filtered_obs.index]
  # Iterate through every position in the control subset
  for position in range(control_subset.shape[1]):
  # Get the coordinate name associated with that position
@@ -91,8 +103,7 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
  probability_thresholding_list[position] = (0.8, np.nan)
  title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
  plt.title(title)
- date_string = date_string()
- save_name = output_directory + f'/{date_string} {title}'
+ save_name = output_directory + f'/{title}'
  if save:
  plt.savefig(save_name)
  plt.close()
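For reference, Youden's J at a position is J = sensitivity + specificity - 1 = TPR - FPR, maximized over candidate probability thresholds; positions whose best J exceeds J_threshold are kept. A minimal sketch of the new percentile-based inference mode (the obs column passed to inference_variable is an assumption):

calculate_position_Youden(
    adata,
    infer_on_percentile=10,                               # top/bottom 10% of reads act as pseudo-controls
    inference_variable="GpC_site_row_methylation_means",  # assumed obs column holding the ranking metric
    J_threshold=0.5,
    obs_column="Reference",
)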