smftools-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. smftools/__init__.py +27 -0
  2. smftools/_settings.py +19 -0
  3. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  4. smftools/datasets/__init__.py +9 -0
  5. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  6. smftools/datasets/datasets.py +25 -0
  7. smftools/informatics/__init__.py +11 -0
  8. smftools/informatics/helpers/__init__.py +42 -0
  9. smftools/informatics/helpers/align_BAM.py +49 -0
  10. smftools/informatics/helpers/binarize_converted_base_identities.py +24 -0
  11. smftools/informatics/helpers/canoncall.py +12 -0
  12. smftools/informatics/helpers/converted_BAM_to_adata.py +147 -0
  13. smftools/informatics/helpers/count_aligned_reads.py +32 -0
  14. smftools/informatics/helpers/extract_base_identities.py +36 -0
  15. smftools/informatics/helpers/extract_mods.py +39 -0
  16. smftools/informatics/helpers/find_conversion_sites.py +53 -0
  17. smftools/informatics/helpers/generate_converted_FASTA.py +59 -0
  18. smftools/informatics/helpers/get_native_references.py +25 -0
  19. smftools/informatics/helpers/informatics.py +260 -0
  20. smftools/informatics/helpers/load_adata.py +516 -0
  21. smftools/informatics/helpers/load_experiment_config.py +17 -0
  22. smftools/informatics/helpers/make_dirs.py +15 -0
  23. smftools/informatics/helpers/make_modbed.py +21 -0
  24. smftools/informatics/helpers/modQC.py +19 -0
  25. smftools/informatics/helpers/modcall.py +14 -0
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +355 -0
  27. smftools/informatics/helpers/one_hot_encode.py +14 -0
  28. smftools/informatics/helpers/separate_bam_by_bc.py +28 -0
  29. smftools/informatics/helpers/split_and_index_BAM.py +21 -0
  30. smftools/informatics/pod5_conversion.py +26 -0
  31. smftools/informatics/pod5_direct.py +29 -0
  32. smftools/informatics/pod5_to_adata.py +17 -0
  33. smftools/informatics/readwrite.py +109 -0
  34. smftools/plotting/__init__.py +0 -0
  35. smftools/preprocessing/__init__.py +35 -0
  36. smftools/preprocessing/append_C_context.py +39 -0
  37. smftools/preprocessing/binarize_on_Youden.py +38 -0
  38. smftools/preprocessing/binary_layers_to_ohe.py +25 -0
  39. smftools/preprocessing/calculate_complexity.py +59 -0
  40. smftools/preprocessing/calculate_converted_read_methylation_stats.py +38 -0
  41. smftools/preprocessing/calculate_coverage.py +35 -0
  42. smftools/preprocessing/calculate_pairwise_hamming_distances.py +22 -0
  43. smftools/preprocessing/calculate_position_Youden.py +95 -0
  44. smftools/preprocessing/calculate_read_length_stats.py +27 -0
  45. smftools/preprocessing/clean_NaN.py +31 -0
  46. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -0
  47. smftools/preprocessing/filter_reads_on_length.py +31 -0
  48. smftools/preprocessing/invert_adata.py +18 -0
  49. smftools/preprocessing/mark_duplicates.py +110 -0
  50. smftools/preprocessing/min_non_diagonal.py +20 -0
  51. smftools/preprocessing/preprocessing.py +614 -0
  52. smftools/preprocessing/remove_duplicates.py +12 -0
  53. smftools/readwrite.py +109 -0
  54. smftools/tools/__init__.py +0 -0
  55. smftools-0.1.0.dist-info/METADATA +75 -0
  56. smftools-0.1.0.dist-info/RECORD +58 -0
  57. smftools-0.1.0.dist-info/WHEEL +4 -0
  58. smftools-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,39 @@
+ ## append_C_context
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ ## Conversion SMF Specific
+ # Read methylation QC
+ def append_C_context(adata, obs_column='Reference', use_consensus=False):
+     """
+     Input: An adata object, the obs_column of interest, and whether to use the consensus sequence for the category.
+     Output: Adds cytosine context to each position within the given category. When use_consensus is True, the consensus sequence is used; otherwise it defaults to the FASTA sequence.
+     """
+     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
+     categories = adata.obs[obs_column].cat.categories
+     for cat in categories:
+         # Look up the sequence inside the loop so that it refers to the current category
+         if use_consensus:
+             sequence = adata.uns[f'{cat}_consensus_sequence']
+         else:
+             sequence = adata.uns[f'{cat}_FASTA_sequence']
+         boolean_dict = {}
+         for site_type in site_types:
+             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+         # Iterate through the sequence and classify each cytosine by its immediate sequence context
+         for i in range(1, len(sequence) - 1):
+             if sequence[i] == 'C':
+                 if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
+                     boolean_dict[f'{cat}_GpC_site'][i] = True
+                 elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
+                     boolean_dict[f'{cat}_CpG_site'][i] = True
+                 elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
+                     # GCG context: ambiguous between a GpC and a CpG site
+                     boolean_dict[f'{cat}_ambiguous_GpC_site'][i] = True
+                     boolean_dict[f'{cat}_ambiguous_CpG_site'][i] = True
+                 else:
+                     boolean_dict[f'{cat}_other_C'][i] = True
+         for site_type in site_types:
+             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}']].copy().X
+
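A minimal usage sketch (not part of the package diff): it assumes the preprocessing subpackage re-exports this helper and that adata.uns already carries a per-reference sequence under the `{cat}_FASTA_sequence` key, as the function expects; 'ref1' is a hypothetical reference name.

    import smftools.preprocessing as pp

    pp.append_C_context(adata, obs_column='Reference', use_consensus=False)
    # Boolean position masks land in adata.var, one column per reference and site type
    gpc_mask = adata.var['ref1_GpC_site']
    gpc_values = adata.obsm['ref1_GpC_site']  # per-read values restricted to GpC positions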
@@ -0,0 +1,38 @@
+ ## binarize_on_Youden
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+
+ def binarize_on_Youden(adata, obs_column='Reference'):
+     """
+     Input: adata object that has had calculate_position_Youden called on it.
+     Output: Adds a new layer to the adata object that holds binarized SMF values based on the position thresholds determined by calculate_position_Youden.
+     """
+     temp_adata = None
+     categories = adata.obs[obs_column].cat.categories
+     for cat in categories:
+         # Get the category subset
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         # Extract the probability matrix for the category subset
+         original_matrix = cat_subset.X
+         # Extract the learned methylation call thresholds for each position in the category
+         thresholds = [cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'][i][0] for i in range(cat_subset.shape[1])]
+         # In the original matrix, get all positions that are nan values
+         nan_mask = np.isnan(original_matrix)
+         # Binarize the matrix on the position-level thresholds
+         binarized_matrix = (original_matrix > thresholds).astype(float)
+         # At the original positions that had nan values, restore the nans
+         binarized_matrix[nan_mask] = np.nan
+         # Make a new layer for the reference that contains the binarized methylation calls
+         cat_subset.layers['binarized_methylation'] = binarized_matrix
+         if temp_adata is not None:
+             # If temp_adata already exists, concatenate
+             temp_adata = ad.concat([temp_adata, cat_subset], join='outer', index_unique=None).copy()
+         else:
+             # If temp_adata is still None, initialize it with the category subset
+             temp_adata = cat_subset.copy()
+
+     # Sort the temp adata on the index names of the primary adata
+     temp_adata = temp_adata[adata.obs_names].copy()
+     # Pull the new binarized layer back into the original adata object
+     adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
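The thresholds this function reads from adata.var are written by calculate_position_Youden (which appears later in this diff), so the intended order is roughly the following sketch; the control sample names are placeholders.

    import smftools.preprocessing as pp

    pp.calculate_position_Youden(adata, positive_control_sample='plus_MTase',
                                 negative_control_sample='minus_MTase', obs_column='Reference')
    pp.binarize_on_Youden(adata, obs_column='Reference')
    binary_calls = adata.layers['binarized_methylation']  # 1.0/0.0 calls with NaNs preserved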
@@ -0,0 +1,25 @@
+ ## binary_layers_to_ohe
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ ## Conversion SMF Specific
+ def binary_layers_to_ohe(adata, layers, stack='hstack'):
+     """
+     Input: An adata object and a list of layers containing a binary encoding.
+     Output: A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
+     """
+     # Extract the layers
+     layers = [adata.layers[layer_name] for layer_name in layers]
+     n_reads = layers[0].shape[0]
+     ohe_dict = {}
+     for i in range(n_reads):
+         read_ohe = []
+         for layer in layers:
+             read_ohe.append(layer[i])
+         read_name = adata.obs_names[i]
+         if stack == 'hstack':
+             ohe_dict[read_name] = np.hstack(read_ohe)
+         elif stack == 'vstack':
+             ohe_dict[read_name] = np.vstack(read_ohe)
+     return ohe_dict
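To make the stack option concrete, a toy illustration of what gets stored per read (the arrays are invented; in the package each element would be one read's row sliced from a binary layer):

    import numpy as np

    gpc = np.array([1.0, 0.0, 1.0, 0.0])  # read's values from one binary layer
    cpg = np.array([0.0, 0.0, 1.0, 1.0])  # same read's values from a second layer
    np.hstack([gpc, cpg])  # stack='hstack': one flat vector of length 8
    np.vstack([gpc, cpg])  # stack='vstack': a (2, 4) array, one row per layer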
@@ -0,0 +1,59 @@
+ ## calculate_complexity
+ import numpy as np
+ import pandas as pd
+ from scipy.optimize import curve_fit
+ import matplotlib.pyplot as plt
+
+ def lander_waterman(x, C0):
+     return C0 * (1 - np.exp(-x / C0))
+
+ def count_unique_reads(reads, depth):
+     subsample = np.random.choice(reads, depth, replace=False)
+     return len(np.unique(subsample))
+
+ def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False, output_directory=''):
+     """
+     Input: adata object with mark_duplicates already run.
+     Output: A complexity analysis of the library
+     """
+     categories = adata.obs[obs_column].cat.categories
+     sample_names = adata.obs[sample_col].cat.categories
+
+     for cat in categories:
+         for sample in sample_names:
+             unique_reads, total_reads = adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'][0:2]
+             reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
+             # Subsampling depths
+             subsampling_depths = [total_reads // (i + 1) for i in range(10)]
+             # Arrays to store results
+             subsampled_total_reads = []
+             subsampled_unique_reads = []
+             # Perform subsampling
+             for depth in subsampling_depths:
+                 unique_count = count_unique_reads(reads, depth)
+                 subsampled_total_reads.append(depth)
+                 subsampled_unique_reads.append(unique_count)
+             # Fit the Lander-Waterman model to the data
+             popt, _ = curve_fit(lander_waterman, subsampled_total_reads, subsampled_unique_reads)
+             # Generate data for the complexity curve
+             x_data = np.linspace(0, 5000, 100)
+             y_data = lander_waterman(x_data, *popt)
+             adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
+             if plot:
+                 # Plot the complexity curve
+                 plt.figure(figsize=(6, 4))
+                 plt.plot(subsampled_total_reads, subsampled_unique_reads, 'o', label='Observed unique reads')
+                 plt.plot(x_data, y_data, '-', label=f'Lander-Waterman fit\nEstimated C0 = {popt[0]:.2f}')
+                 plt.xlabel('Total number of reads')
+                 plt.ylabel('Number of unique reads')
+                 title = f'Library Complexity Analysis for {sample} on {cat}'
+                 plt.title(title)
+                 plt.legend()
+                 plt.grid(True)
+                 if save_plot:
+                     date_str = date_string()
+                     save_name = output_directory + f'/{date_str} {title}'
+                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+                     plt.close()
+                 else:
+                     plt.show()
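The lander_waterman helper encodes the standard saturation model: with an effective library size C0, the expected number of distinct reads after sequencing x total reads is C0 * (1 - exp(-x / C0)). A tiny numerical illustration (the C0 value here is made up, not output from the package):

    import numpy as np

    C0 = 1200.0  # hypothetical fitted library size
    for x in (100, 1200, 5000):
        expected_unique = C0 * (1 - np.exp(-x / C0))
        print(x, round(expected_unique))  # roughly 96, 759, 1181: saturating toward C0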
@@ -0,0 +1,38 @@
+ ## calculate_converted_read_methylation_stats
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ ## Conversion SMF Specific
+ # Read methylation QC
+
+ def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
+     """
+     Input: adata and the observation category of interest
+     Output: Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives)
+     """
+     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
+     categories = adata.obs[obs_column].cat.categories
+     for site_type in site_types:
+         adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
+         adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+         adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
+         adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+     for cat in categories:
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         for site_type in site_types:
+             print(f'Iterating over {cat}_{site_type}')
+             observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
+             number_valid_positions_in_read = np.nansum(~np.isnan(observation_matrix), axis=1)
+             row_methylation_sums = np.nansum(observation_matrix, axis=1)
+             number_valid_positions_in_read[number_valid_positions_in_read == 0] = 1
+             fraction_valid_positions_in_range = number_valid_positions_in_read / np.max(number_valid_positions_in_read)
+             row_methylation_means = np.divide(row_methylation_sums, number_valid_positions_in_read)
+             temp_obs_data = pd.DataFrame({f'number_valid_{site_type}_in_read': number_valid_positions_in_read,
+                                           f'fraction_valid_{site_type}_in_range': fraction_valid_positions_in_range,
+                                           f'{site_type}_row_methylation_sums': row_methylation_sums,
+                                           f'{site_type}_row_methylation_means': row_methylation_means}, index=cat_subset.obs.index)
+             adata.obs.update(temp_obs_data)
+     # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
+     pass_array = np.array(adata.obs['GpC_site_row_methylation_means'] > adata.obs['other_C_row_methylation_means'])
+     adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
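A short sketch of how the per-read columns might be inspected once the function has run (pp stands for the preprocessing subpackage, as assumed above; the column names follow the f-strings in the function):

    import smftools.preprocessing as pp

    pp.calculate_converted_read_methylation_stats(adata, obs_column='Reference')
    qc = adata.obs[['GpC_site_row_methylation_means', 'other_C_row_methylation_means', 'GpC_above_other_C']]
    print(qc.head())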
@@ -0,0 +1,35 @@
+ ## calculate_coverage
+ from .. import readwrite
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+
+ def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
+     """
+     Input: An adata object and an observation column of interest. Assesses whether each position is covered within the dataset category.
+     Output: Appends position-level metadata indicating whether the position is informative within the given observation category.
+     """
+     categories = adata.obs[obs_column].cat.categories
+     n_categories_with_position = np.zeros(adata.shape[1])
+     # Loop over categories
+     for cat in categories:
+         # Look at positional information for each reference
+         temp_cat_adata = adata[adata.obs[obs_column] == cat]
+         # Look at read coverage on the given category strand
+         cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
+         cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
+         cat_valid_fraction = cat_valid_coverage / (cat_valid_coverage + cat_invalid_coverage)
+         # Append metadata for category to the anndata object
+         adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
+         # Characterize whether the position is covered in the given category or not
+         conditions = [
+             (adata.var[f'{cat}_valid_fraction'] >= position_nan_threshold),
+             (adata.var[f'{cat}_valid_fraction'] < position_nan_threshold)
+         ]
+         choices = [True, False]
+         adata.var[f'position_in_{cat}'] = np.select(conditions, choices, default=False)
+         n_categories_with_position += np.array(adata.var[f'position_in_{cat}'])
+
+     # Final array with the sum at each position of the number of categories covering that position
+     adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
@@ -0,0 +1,22 @@
+ ## calculate_pairwise_hamming_distances
+ import numpy as np
+ from tqdm import tqdm
+ from scipy.spatial.distance import hamming
+
+ ## Conversion SMF Specific
+ def calculate_pairwise_hamming_distances(arrays):
+     """
+     Calculate the pairwise Hamming distances for a list of ndarrays.
+     Input: A list of ndarrays
+     Output: A 2D array containing the pairwise Hamming distances.
+     """
+     num_arrays = len(arrays)
+     # Initialize an empty distance matrix
+     distance_matrix = np.zeros((num_arrays, num_arrays))
+     # Calculate pairwise distances with a progress bar
+     for i in tqdm(range(num_arrays), desc="Calculating Hamming Distances"):
+         for j in range(i + 1, num_arrays):
+             distance = hamming(arrays[i], arrays[j])
+             distance_matrix[i, j] = distance
+             distance_matrix[j, i] = distance
+     return distance_matrix
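scipy's hamming returns the fraction of positions that disagree, so the matrix entries are normalized to [0, 1]. A quick check outside the package:

    import numpy as np
    from scipy.spatial.distance import hamming

    a = np.array([1, 0, 1, 1])
    b = np.array([1, 1, 1, 0])
    hamming(a, b)  # 0.5 - two of the four positions differ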
@@ -0,0 +1,95 @@
+ ## calculate_position_Youden
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import roc_curve, roc_auc_score
+
+
+
+ ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
+ def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False, output_directory=''):
+     """
+     Input: An adata object, a plus MTase control, a minus MTase control, the minimal J-statistic threshold, and a categorical observation column to iterate over.
+     Input notes: The control samples are passed as string names of the samples as they appear in the 'Sample_names' obs column.
+     Output: Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
+     Can optionally save the output plots of the ROC curves.
+     """
+     control_samples = [positive_control_sample, negative_control_sample]
+     categories = adata.obs[obs_column].cat.categories
+     # Iterate over each category in the specified obs_column
+     for cat in categories:
+         # Subset to keep only reads associated with the category
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         # Iterate over positive and negative control samples
+         for control in control_samples:
+             # Initialize a dictionary for the given control sample, keyed by position and pointing to a tuple of (position, methylation probabilities, fraction coverage)
+             adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
+             # Get the current control subset on the given category
+             filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
+             control_subset = cat_subset[filtered_obs.index].copy()
+             # Iterate through every position in the control subset
+             for position in range(control_subset.shape[1]):
+                 # Get the coordinate name associated with that position
+                 coordinate = control_subset.var_names[position]
+                 # Get the array of methylation probabilities for each read in the subset at that position
+                 position_data = control_subset.X[:, position]
+                 # Get the indexes of everywhere that is not a nan value
+                 nan_mask = ~np.isnan(position_data)
+                 # Keep only the methylation data that has real values
+                 position_data = position_data[nan_mask]
+                 # Get the position data coverage
+                 position_coverage = len(position_data)
+                 # Get fraction coverage
+                 fraction_coverage = position_coverage / control_subset.shape[0]
+                 # Save the position and the position methylation data for the control subset
+                 adata.uns[f'{cat}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
+
+     for cat in categories:
+         fig, ax = plt.subplots(figsize=(6, 4))
+         plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
+         plt.xlabel('False Positive Rate')
+         plt.ylabel('True Positive Rate')
+         ax.spines['right'].set_visible(False)
+         ax.spines['top'].set_visible(False)
+         n_passed_positions = 0
+         n_total_positions = 0
+         # Initialize a list that will hold the positional thresholds for the category
+         probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
+         for i, key in enumerate(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'].keys()):
+             position = int(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][0])
+             positive_position_array = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][1]
+             fraction_coverage = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][2]
+             if fraction_coverage > 0.2:
+                 try:
+                     negative_position_array = adata.uns[f'{cat}_position_methylation_dict_{negative_control_sample}'][key][1]
+                     # Combine the negative and positive control data
+                     data = np.concatenate([negative_position_array, positive_position_array])
+                     labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))
+                     # Calculate the ROC curve
+                     fpr, tpr, thresholds = roc_curve(labels, data)
+                     # Calculate Youden's J statistic
+                     J = tpr - fpr
+                     optimal_idx = np.argmax(J)
+                     optimal_threshold = thresholds[optimal_idx]
+                     max_J = np.max(J)
+                     data_tuple = (optimal_threshold, max_J)
+                     probability_thresholding_list[position] = data_tuple
+                     n_total_positions += 1
+                     if max_J > J_threshold:
+                         n_passed_positions += 1
+                         plt.plot(fpr, tpr, label='ROC curve')
+                 except Exception:
+                     probability_thresholding_list[position] = (0.8, np.nan)
+         title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
+         plt.title(title)
+         if save:
+             date_str = date_string()
+             save_name = output_directory + f'/{date_str} {title}'
+             plt.savefig(save_name)
+             plt.close()
+         else:
+             plt.show()
+         adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
+         J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
+         adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
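A brief, hedged sketch of reading back the per-position results ('ref1' is a hypothetical reference category; the column names follow the f-strings above):

    passed = adata.var['ref1_position_passed_QC']
    stats = adata.var['ref1_position_methylation_thresholding_Youden_stats']  # (threshold, max J) per position
    reliable_positions = adata.var_names[passed]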
@@ -0,0 +1,27 @@
+ ## calculate_read_length_stats
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ # Read length QC
+ def calculate_read_length_stats(adata):
+     """
+     Input: An adata object
+     Output: Appends the first and last valid positions of each read and, from these, the read length.
+     Returns two new variables which hold the last and first valid positions in the entire dataset.
+     """
+     # Add basic observation-level (read-level) metadata to the anndata object:
+     # the first valid position in a read and the last valid position in the read
+     read_first_valid_position = np.array([int(adata.var_names[i]) for i in np.argmax(~np.isnan(adata.X), axis=1)])
+     read_last_valid_position = np.array([int(adata.var_names[i]) for i in (adata.X.shape[1] - 1 - np.argmax(~np.isnan(adata.X[:, ::-1]), axis=1))])
+     read_length = read_last_valid_position - read_first_valid_position + np.ones(len(read_first_valid_position))
+
+     adata.obs['first_valid_position'] = pd.Series(read_first_valid_position, index=adata.obs.index, dtype=int)
+     adata.obs['last_valid_position'] = pd.Series(read_last_valid_position, index=adata.obs.index, dtype=int)
+     adata.obs['read_length'] = pd.Series(read_length, index=adata.obs.index, dtype=int)
+
+     # Define variables to hold the first and last valid position in the dataset
+     upper_bound = int(np.nanmax(adata.obs['last_valid_position']))
+     lower_bound = int(np.nanmin(adata.obs['first_valid_position']))
+     return upper_bound, lower_bound
@@ -0,0 +1,31 @@
+ ## clean_NaN
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ # NaN handling
+ def clean_NaN(adata, layer=None):
+     """
+     Input: An adata object and the layer to fill NaN values of
+     Output: Append layers to adata that contain NaN cleaning strategies
+     """
+     # Fill NaN with the closest SMF value
+     df = adata_to_df(adata, layer=layer)
+     df = df.ffill(axis=1).bfill(axis=1)
+     adata.layers['fill_nans_closest'] = df.values
+
+     # Replace NaN values with 0, and 0 with minus 1
+     old_value, new_value = [0, -1]
+     df = adata_to_df(adata, layer=layer)
+     df = df.replace(old_value, new_value)
+     old_value, new_value = [np.nan, 0]
+     df = df.replace(old_value, new_value)
+     adata.layers['nan0_0minus1'] = df.values
+
+     # Replace NaN values with 1, and 1 with 2
+     old_value, new_value = [1, 2]
+     df = adata_to_df(adata, layer=layer)
+     df = df.replace(old_value, new_value)
+     old_value, new_value = [np.nan, 1]
+     df = df.replace(old_value, new_value)
+     adata.layers['nan1_12'] = df.values
@@ -0,0 +1,20 @@
+ ## filter_converted_reads_on_methylation
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ ## Conversion SMF Specific
+ # Read methylation QC
+ def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025):
+     """
+     Input: Adata object. Minimum thresholds for the valid SMF site fraction in a read, as well as the minimum methylation content in a read.
+     Output: A subset of the adata object
+     """
+     if valid_SMF_site_threshold:
+         # Keep reads that have over a given valid GpC site content
+         adata = adata[adata.obs['fraction_valid_GpC_site_in_range'] > valid_SMF_site_threshold].copy()
+     if min_SMF_threshold:
+         # Keep reads with SMF methylation over background methylation
+         adata = adata[adata.obs['GpC_above_other_C'] == True].copy()
+         # Keep reads over a defined methylation threshold
+         adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
+     # Return the filtered subset so the caller can reassign it
+     return adata
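Because AnnData subsetting returns a new object, the result needs to be reassigned by the caller; a minimal sketch with the default thresholds (pp is the preprocessing subpackage, as assumed above):

    import smftools.preprocessing as pp

    adata = pp.filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025)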
@@ -0,0 +1,31 @@
+ ## filter_reads_on_length
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2700):
+     """
+     Input: Adata object, a list of lower and upper bounds (set to False or None if not wanted), and a minimum read length integer.
+     Output: Subsets the adata object to keep a defined coordinate window, as well as reads that are over a minimum length threshold.
+     """
+     if filter_on_coordinates:
+         lower_bound, upper_bound = filter_on_coordinates
+         # Extract the position information from the adata object as an np array
+         var_names_arr = adata.var_names.astype(int).to_numpy()
+         # Find the upper bound coordinate that is closest to the specified value
+         closest_end_index = np.argmin(np.abs(var_names_arr - upper_bound))
+         upper_bound = int(adata.var_names[closest_end_index])
+         # Find the lower bound coordinate that is closest to the specified value
+         closest_start_index = np.argmin(np.abs(var_names_arr - lower_bound))
+         lower_bound = int(adata.var_names[closest_start_index])
+         # Get a list of positional indexes that encompass the lower and upper bounds of the dataset
+         position_list = list(range(lower_bound, upper_bound + 1))
+         position_list = [str(pos) for pos in position_list]
+         position_set = set(position_list)
+         print(f'Subsetting adata to keep data between coordinates {lower_bound} and {upper_bound}')
+         adata = adata[:, adata.var_names.isin(position_set)].copy()
+
+     if min_read_length:
+         print(f'Subsetting adata to keep reads longer than {min_read_length}')
+         adata = adata[adata.obs['read_length'] > min_read_length].copy()
+     # Return the filtered subset so the caller can reassign it
+     return adata
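A sketch tying the two length helpers together: calculate_read_length_stats returns the dataset-wide bounds, which can be passed back in as the coordinate window (pp is the preprocessing subpackage, as assumed above):

    import smftools.preprocessing as pp

    upper_bound, lower_bound = pp.calculate_read_length_stats(adata)
    adata = pp.filter_reads_on_length(adata, filter_on_coordinates=(lower_bound, upper_bound), min_read_length=2700)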
@@ -0,0 +1,18 @@
+ ## invert_adata
+ import numpy as np
+ import anndata as ad
+ import pandas as pd
+
+ # Optional inversion of the adata
+ def invert_adata(adata):
+     """
+     Input: An adata object
+     Output: Returns the adata object inverted along the variable axis
+     """
+     # Reassign var_names with new names
+     old_var_names = adata.var_names.astype(int).to_numpy()
+     new_var_names = np.sort(old_var_names)[::-1].astype(str)
+     adata.var['Original_positional_coordinate'] = old_var_names.astype(str)
+     adata.var_names = new_var_names
+     # Sort the AnnData object based on the old var_names and return the inverted view
+     adata = adata[:, old_var_names.astype(str)]
+     return adata
@@ -0,0 +1,110 @@
+ ## mark_duplicates
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from scipy.signal import find_peaks
+ import networkx as nx
+ from .binary_layers_to_ohe import binary_layers_to_ohe
+ from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
+ from .min_non_diagonal import min_non_diagonal
+
+
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
+     """
+     Input: adata object, list of binary layers, column names to use.
+     Output: Marks duplicates in the adata object
+     """
+     categories = adata.obs[obs_column].cat.categories
+     sample_names = adata.obs[sample_col].cat.categories
+
+     # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
+     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+     for cat in categories:
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         for sample in sample_names:
+             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+             # Encode sequencing reads as one-hot encodings
+             adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+             # Unpack the read names and one-hot encodings into lists
+             read_names = []
+             ohe_list = []
+             for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
+                 read_names.append(read_name)
+                 ohe_list.append(ohe)
+             # Calculate the pairwise Hamming distances
+             print(f'Calculating Hamming distances for {sample} on {cat} allele')
+             distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
+             n_reads = distance_matrix.shape[0]
+             # Load the Hamming matrix into a dataframe with index and column names as the read_names
+             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
+             # Save the distance dataframe into an unstructured component of the adata object
+             adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+             # Calculate the minimum non-self distance for every read in the reference and sample
+             min_distance_values = min_non_diagonal(distance_matrix)
+             min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
+             adata.obs.update(min_distance_df)
+             # Generate a histogram of minimum non-self distances for each read
+             min_distance_bins = plt.hist(min_distance_values, bins=n_reads // 4)
+             # Normalize the max value in any histogram bin to 1
+             normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
+             # Extract the bin index of peak centers in the histogram
+             peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+             first_peak_index = peak_centers[0]
+             offset_index = first_peak_index - 1
+             # Use the distance just below the first peak as the threshold distance in graph construction
+             first_peak_distance = min_distance_bins[1][first_peak_index]
+             offset_distance = min_distance_bins[1][offset_index]
+             adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
+
+     ## Detect likely duplicate reads and mark them in the adata object.
+     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
+
+     for cat in categories:
+         for sample in sample_names:
+             distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+             read_names = distance_df.index
+             distance_matrix = distance_df.values
+             n_reads = distance_matrix.shape[0]
+             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
+             # Initialize the read distance graph
+             G = nx.Graph()
+             # Add each read as a node to the graph
+             G.add_nodes_from(range(n_reads))
+             # Add edges based on the threshold
+             for i in range(n_reads):
+                 for j in range(i + 1, n_reads):
+                     if distance_matrix[i, j] <= distance_threshold:
+                         G.add_edge(i, j)
+             # Determine distinct clusters using connected components
+             clusters = list(nx.connected_components(G))
+             clusters = [list(cluster) for cluster in clusters]
+             # Get the number of clusters
+             cluster_count = len(clusters)
+             adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, cluster_count / n_reads, clusters]
+             # Update the adata object
+             read_cluster_map = {}
+             read_duplicate_map = {}
+             read_keep_map = {}
+             for i, cluster in enumerate(clusters):
+                 for j, read_index in enumerate(cluster):
+                     read_name = read_names[read_index]
+                     read_cluster_map[read_name] = i
+                     if len(cluster) > 1:
+                         read_duplicate_map[read_name] = True
+                         # Keep only the first read of each multi-read cluster
+                         if j == 0:
+                             read_keep_map[read_name] = True
+                         else:
+                             read_keep_map[read_name] = False
+                     elif len(cluster) == 1:
+                         read_duplicate_map[read_name] = False
+                         read_keep_map[read_name] = True
+             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
+             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
+             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
+             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
+             adata.obs.update(df_combined)
+             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
+             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
+             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {cluster_count / n_reads}')
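A hedged sketch of the deduplication pass these helpers appear to support. The choice of binary layer is an assumption (the binarized_methylation layer produced earlier is one plausible input), and dropping duplicates via the boolean column is one option alongside the package's remove_duplicates helper listed in this diff:

    import smftools.preprocessing as pp

    pp.mark_duplicates(adata, layers=['binarized_methylation'], obs_column='Reference', sample_col='Sample_names')
    pp.calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names')
    adata = adata[adata.obs['Unique_in_final_read_set']].copy()  # keep one read per Hamming-distance cluster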
@@ -0,0 +1,20 @@
+ ## min_non_diagonal
+ import numpy as np
+
+ def min_non_diagonal(matrix):
+     """
+     Takes a matrix and returns the smallest value from each row with the diagonal masked.
+     Input: A data matrix
+     Output: A list of minimum values from each row of the matrix
+     """
+     n = matrix.shape[0]
+     min_values = []
+     for i in range(n):
+         # Mask to exclude the diagonal element
+         row_mask = np.ones(n, dtype=bool)
+         row_mask[i] = False
+         # Extract the row excluding the diagonal element
+         row = matrix[i, row_mask]
+         # Find the minimum value in the row
+         min_values.append(np.min(row))
+     return min_values