smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (66)
  1. smftools-0.1.6.dist-info/METADATA +127 -0
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -19
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/__init__.py +0 -9
  8. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  9. smftools/datasets/datasets.py +0 -27
  10. smftools/informatics/__init__.py +0 -12
  11. smftools/informatics/bam_conversion.py +0 -47
  12. smftools/informatics/bam_direct.py +0 -49
  13. smftools/informatics/basecalls_to_adata.py +0 -42
  14. smftools/informatics/fast5_to_pod5.py +0 -19
  15. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  16. smftools/informatics/helpers/__init__.py +0 -42
  17. smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
  18. smftools/informatics/helpers/archived/informatics.py +0 -260
  19. smftools/informatics/helpers/archived/load_adata.py +0 -516
  20. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  21. smftools/informatics/helpers/canoncall.py +0 -23
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
  23. smftools/informatics/helpers/count_aligned_reads.py +0 -39
  24. smftools/informatics/helpers/extract_base_identities.py +0 -43
  25. smftools/informatics/helpers/extract_mods.py +0 -51
  26. smftools/informatics/helpers/find_conversion_sites.py +0 -59
  27. smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
  28. smftools/informatics/helpers/get_native_references.py +0 -28
  29. smftools/informatics/helpers/make_dirs.py +0 -21
  30. smftools/informatics/helpers/make_modbed.py +0 -27
  31. smftools/informatics/helpers/modQC.py +0 -27
  32. smftools/informatics/helpers/modcall.py +0 -26
  33. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
  34. smftools/informatics/helpers/one_hot_encode.py +0 -19
  35. smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
  36. smftools/informatics/helpers/split_and_index_BAM.py +0 -29
  37. smftools/informatics/pod5_conversion.py +0 -53
  38. smftools/informatics/pod5_direct.py +0 -55
  39. smftools/informatics/pod5_to_adata.py +0 -40
  40. smftools/informatics/readwrite.py +0 -106
  41. smftools/informatics/subsample_pod5.py +0 -48
  42. smftools/plotting/__init__.py +0 -0
  43. smftools/preprocessing/__init__.py +0 -29
  44. smftools/preprocessing/append_C_context.py +0 -46
  45. smftools/preprocessing/archives/preprocessing.py +0 -614
  46. smftools/preprocessing/binarize_on_Youden.py +0 -42
  47. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  48. smftools/preprocessing/calculate_complexity.py +0 -71
  49. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
  50. smftools/preprocessing/calculate_coverage.py +0 -41
  51. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  52. smftools/preprocessing/calculate_position_Youden.py +0 -104
  53. smftools/preprocessing/calculate_read_length_stats.py +0 -32
  54. smftools/preprocessing/clean_NaN.py +0 -38
  55. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
  56. smftools/preprocessing/filter_reads_on_length.py +0 -39
  57. smftools/preprocessing/invert_adata.py +0 -22
  58. smftools/preprocessing/mark_duplicates.py +0 -119
  59. smftools/preprocessing/min_non_diagonal.py +0 -25
  60. smftools/preprocessing/remove_duplicates.py +0 -18
  61. smftools/readwrite.py +0 -106
  62. smftools/tools/__init__.py +0 -0
  63. smftools-0.1.1.dist-info/METADATA +0 -88
  64. smftools-0.1.1.dist-info/RECORD +0 -64
  65. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  66. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/archives/preprocessing.py
@@ -1,614 +0,0 @@
- ## preprocessing
- from .. import readwrite
-
- # Clustering and stats
- from sklearn.metrics import roc_curve, roc_auc_score
- from scipy.optimize import curve_fit
- from scipy.spatial.distance import pdist, squareform
- from scipy.spatial.distance import hamming
- import networkx as nx
-
- # Signal processing
- from scipy.signal import find_peaks
-
- # Plotting
- import matplotlib as mpl
- import matplotlib.pyplot as plt
- import seaborn as sns
-
- # User interface
- from tqdm import tqdm
-
- output_directory =''
-
- ######################################################################################################
- ## General SMF
-
- def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
-     """
-     Input: An adata object and an observation column of interest. Assess if the position is present in the dataset category.
-     Output: Append position level metadata indicating whether the position is informative within the given observation category.
-     """
-     categories = adata.obs[obs_column].cat.categories
-     n_categories_with_position = np.zeros(adata.shape[1])
-     # Loop over reference strands
-     for cat in categories:
-         # Look at positional information for each reference
-         temp_cat_adata = adata[adata.obs[obs_column] == cat]
-         # Look at read coverage on the given category strand
-         cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
-         cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
-         cat_valid_fraction = cat_valid_coverage / (cat_valid_coverage + cat_invalid_coverage)
-         # Append metadata for category to the anndata object
-         adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
-         # Characterize if the position is in the given category or not
-         conditions = [
-             (adata.var[f'{cat}_valid_fraction'] >= position_nan_threshold),
-             (adata.var[f'{cat}_valid_fraction'] < position_nan_threshold)
-         ]
-         choices = [True, False]
-         adata.var[f'position_in_{cat}'] = np.select(conditions, choices, default=False)
-         n_categories_with_position += np.array(adata.var[f'position_in_{cat}'])
-
-     # Final array with the sum at each position of the number of categories covering that position
-     adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
-
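A minimal usage sketch for the coverage call above (illustrative only, not part of the released package: the toy AnnData, its 'Reference' values, and the threshold are invented here, and the sketch assumes the function above is in scope together with numpy and pandas):

    # Toy example: 2 reads x 3 positions, NaN marks positions a read does not cover.
    import anndata as ad
    import numpy as np
    import pandas as pd

    X = np.array([[1.0, np.nan, 0.0],
                  [np.nan, np.nan, 1.0]])
    obs = pd.DataFrame({'Reference': pd.Categorical(['amplicon1', 'amplicon1'])}, index=['read1', 'read2'])
    var = pd.DataFrame(index=['0', '1', '2'])
    toy = ad.AnnData(X=X, obs=obs, var=var)

    calculate_coverage(toy, obs_column='Reference', position_nan_threshold=0.05)
    print(toy.var[['amplicon1_valid_fraction', 'position_in_amplicon1', 'N_Reference_with_position']])
    # Position 1 is NaN in every read, so its valid fraction is 0.0 and it is not counted as covered.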
- # Optional inversion of the adata
- def invert_adata(adata):
-     """
-     Input: An adata object
-     Output: Inverts the adata object along the variable axis
-     """
-     # Reassign var_names with new names
-     old_var_names = adata.var_names.astype(int).to_numpy()
-     new_var_names = np.sort(old_var_names)[::-1].astype(str)
-     adata.var['Original_positional_coordinate'] = old_var_names.astype(str)
-     adata.var_names = new_var_names
-     # Sort the AnnData object based on the old var_names
-     adata = adata[:, old_var_names.astype(str)]
-
- # Read length QC
- def calculate_read_length_stats(adata):
-     """
-     Input: An adata object
-     Output: Append first valid position in a read and last valid position in the read. From this determine and append the read length.
-     Return two new variable which hold the first and last valid positions in the entire dataset
-     """
-     ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variable which hold the first and last valid positions in the entire dataset
-
-     # Add some basic observation-level (read-level) metadata to the anndata object
-     read_first_valid_position = np.array([int(adata.var_names[i]) for i in np.argmax(~np.isnan(adata.X), axis=1)])
-     read_last_valid_position = np.array([int(adata.var_names[i]) for i in (adata.X.shape[1] - 1 - np.argmax(~np.isnan(adata.X[:, ::-1]), axis=1))])
-     read_length = read_last_valid_position - read_first_valid_position + np.ones(len(read_first_valid_position))
-
-     adata.obs['first_valid_position'] = pd.Series(read_first_valid_position, index=adata.obs.index, dtype=int)
-     adata.obs['last_valid_position'] = pd.Series(read_last_valid_position, index=adata.obs.index, dtype=int)
-     adata.obs['read_length'] = pd.Series(read_length, index=adata.obs.index, dtype=int)
-
-     # Define variables to hold the first and last valid position in the dataset
-     upper_bound = int(np.nanmax(adata.obs['last_valid_position']))
-     lower_bound = int(np.nanmin(adata.obs['first_valid_position']))
-     return upper_bound, lower_bound
-
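The first/last valid-position bookkeeping above hinges on np.argmax over a NaN mask; a standalone illustration with toy values (the array and coordinates are invented for this sketch):

    import numpy as np

    row = np.array([np.nan, np.nan, 0.0, 1.0, np.nan])   # one read across 5 positions
    coords = np.array([100, 101, 102, 103, 104])          # positional var_names as integers

    first_valid = coords[np.argmax(~np.isnan(row))]                          # 102
    last_valid = coords[len(row) - 1 - np.argmax(~np.isnan(row[::-1]))]      # 103
    read_length = last_valid - first_valid + 1                               # 2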
- def plot_read_length_QC(adata, lower_bound, upper_bound, obs_column='Reference', sample_col='Sample_names', save=False):
-     """
-     """
-     categories = adata.obs[obs_column].cat.categories
-     sample_names = adata.obs[sample_col].cat.categories
-     ## Plot histogram of read length data and save the median and stdev of the read lengths for each sample.
-     adata.uns['read_length_dict'] = {}
-     for cat in categories:
-         temp_cat_adata = adata[adata.obs[obs_column] == cat].copy()
-         split_cat = cat.split('_')[0][1:]
-         for sample in sample_names:
-             temp_sample_adata = temp_cat_adata[temp_cat_adata.obs[sample_col] == sample].copy()
-             temp_data = temp_sample_adata.obs['read_length']
-             max_length = np.max(temp_data)
-             mean = np.mean(temp_data)
-             median = np.median(temp_data)
-             stdev = np.std(temp_data)
-             adata.uns['read_length_dict'][f'{cat}_{sample}'] = [mean, median, stdev]
-             n_bins = int(max_length // 100)
-             plt.figure(figsize=(10, 6))
-             plt.text(median + 0.5, max(plt.hist(temp_data, bins=n_bins)[0]) / 2, f'Median: {median:.2f}', color='red')
-             plt.hist(temp_data, bins=n_bins, alpha=0.7, color='blue', edgecolor='black')
-             plt.xlabel('Read Length')
-             plt.ylabel('Count')
-             title = f'Read length distribution of {temp_sample_adata.shape[0]} total reads from {sample} sample on {split_cat} allele'
-             plt.title(title)
-             # Add a vertical line at the median
-             plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
-             # Annotate the median
-             plt.xlim(lower_bound - 100, upper_bound + 100)
-             if save:
-                 date_string = date_string()
-                 save_name = output_directory + f'/{date_string} {title}'
-                 plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
-                 plt.close()
-             else:
-                 plt.show()
-
- def filter_reads_on_length(adata, filter_on_coordinates=False, min_read_length=2700):
-     """
-     Input: Adata object. a list of lower and upper bound (set to False or None if not wanted), and a minimum read length integer.
-     Output: Susbets the adata object to keep a defined coordinate window, as well as reads that are over a minimum threshold in length
-     """
-
-     if filter_on_coordinates:
-         lower_bound, upper_bound = filter_on_coordinates
-         # Extract the position information from the adata object as an np array
-         var_names_arr = adata.var_names.astype(int).to_numpy()
-         # Find the upper bound coordinate that is closest to the specified value
-         closest_end_index = np.argmin(np.abs(var_names_arr - upper_bound))
-         upper_bound = int(adata.var_names[closest_end_index])
-         # Find the lower bound coordinate that is closest to the specified value
-         closest_start_index = np.argmin(np.abs(var_names_arr - lower_bound))
-         lower_bound = int(adata.var_names[closest_start_index])
-         # Get a list of positional indexes that encompass the lower and upper bounds of the dataset
-         position_list = list(range(lower_bound, upper_bound + 1))
-         position_list = [str(pos) for pos in position_list]
-         position_set = set(position_list)
-         print(f'Subsetting adata to keep data between coordinates {lower_bound} and {upper_bound}')
-         adata = adata[:, adata.var_names.isin(position_set)].copy()
-
-     if min_read_length:
-         print(f'Subsetting adata to keep reads longer than {min_read_length}')
-         adata = adata[adata.obs['read_length'] > min_read_length].copy()
-
- # NaN handling
- def clean_NaN(adata, layer=None):
-     """
-     Input: An adata object and the layer to fill Nan values of
-     Output: Append layers to adata that contain NaN cleaning strategies
-     """
-     # Fill NaN with closest SMF value
-     df = adata_to_df(adata, layer=layer)
-     df = df.ffill(axis=1).bfill(axis=1)
-     adata.layers['fill_nans_closest'] = df.values
-
-     # Replace NaN values with 0, and 0 with minus 1
-     old_value, new_value = [0, -1]
-     df = adata_to_df(adata, layer=layer)
-     df = df.replace(old_value, new_value)
-     old_value, new_value = [np.nan, 0]
-     df = df.replace(old_value, new_value)
-     adata.layers['nan0_0minus1'] = df.values
-
-     # Replace NaN values with 1, and 1 with 2
-     old_value, new_value = [1, 2]
-     df = adata_to_df(adata, layer=layer)
-     df = df.replace(old_value, new_value)
-     old_value, new_value = [np.nan, 1]
-     df = df.replace(old_value, new_value)
-     adata.layers['nan1_12'] = df.values
-
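The three layers created above encode missing data differently; a small numeric illustration of the two integer recodings (toy row, not package output):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame([[1.0, 0.0, np.nan]])   # methylated, unmethylated, missing

    # 'nan0_0minus1': methylated -> 1, unmethylated -> -1, missing -> 0
    print(df.replace(0, -1).replace(np.nan, 0).values)   # [[ 1. -1.  0.]]

    # 'nan1_12': methylated -> 2, unmethylated -> 0, missing -> 1
    print(df.replace(1, 2).replace(np.nan, 1).values)    # [[2. 0. 1.]]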
- ######################################################################################################
-
- ######################################################################################################
- ## Conversion SMF Specific
- ##############################################
-
- # Read methylation QC
- def append_C_context(adata, obs_column='Reference', use_consensus=False):
-     """
-     Input: An adata object, the obs_column of interst, and whether to use the consensus sequence from the category.
-     Output: Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
-     """
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
-     categories = adata.obs[obs_column].cat.categories
-     if use_consensus:
-         sequence = adata.uns[f'{cat}_consensus_sequence']
-     else:
-         sequence = adata.uns[f'{cat}_FASTA_sequence']
-     for cat in categories:
-         boolean_dict = {}
-         for site_type in site_types:
-             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-         # Iterate through the sequence and apply the criteria
-         for i in range(1, len(sequence) - 1):
-             if sequence[i] == 'C':
-                 if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                     boolean_dict[f'{cat}_GpC_site'][i] = True
-                 elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                     boolean_dict[f'{cat}_ambiguous_GpC_site'][i] = True
-                 elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                     boolean_dict[f'{cat}_CpG_site'][i] = True
-                 elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                     boolean_dict[f'{cat}_ambiguous_CpG_site'][i] = True
-                 elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                     boolean_dict[f'{cat}_other_C'][i] = True
-         for site_type in site_types:
-             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].copy().X
-
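A worked illustration of the cytosine-context rules applied in the loop above, condensed into a small standalone function (the wrapper name and the toy sequence are mine, not the package's; only the reachable branches are reproduced):

    def classify_c_context(seq):
        # Classify each internal cytosine by its flanking bases, as in the loop above.
        labels = {}
        for i in range(1, len(seq) - 1):
            if seq[i] != 'C':
                continue
            if seq[i - 1] == 'G' and seq[i + 1] != 'G':
                labels[i] = 'GpC_site'
            elif seq[i - 1] == 'G' and seq[i + 1] == 'G':
                labels[i] = 'ambiguous_GpC_site'
            elif seq[i - 1] != 'G' and seq[i + 1] == 'G':
                labels[i] = 'CpG_site'
            else:
                labels[i] = 'other_C'
        return labels

    print(classify_c_context('AGCATCGAGCGAACA'))
    # {2: 'GpC_site', 5: 'CpG_site', 9: 'ambiguous_GpC_site', 13: 'other_C'}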
- def calculate_read_methylation_stats(adata, obs_column='Reference'):
-     """
-     Input: adata and the observation category of interest
-     Output: Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives)
-     """
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
-     categories = adata.obs[obs_column].cat.categories
-     for site_type in site_types:
-         adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
-         adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-         adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
-         adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-     for cat in categories:
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         for site_type in site_types:
-             print(f'Iterating over {cat}_{site_type}')
-             observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
-             number_valid_positions_in_read = np.nansum(~np.isnan(observation_matrix), axis=1)
-             row_methylation_sums = np.nansum(observation_matrix, axis=1)
-             number_valid_positions_in_read[number_valid_positions_in_read == 0] = 1
-             fraction_valid_positions_in_range = number_valid_positions_in_read / np.max(number_valid_positions_in_read)
-             row_methylation_means = np.divide(row_methylation_sums, number_valid_positions_in_read)
-             temp_obs_data = pd.DataFrame({f'number_valid_{site_type}_in_read': number_valid_positions_in_read,
-                                           f'fraction_valid_{site_type}_in_range': fraction_valid_positions_in_range,
-                                           f'{site_type}_row_methylation_sums': row_methylation_sums,
-                                           f'{site_type}_row_methylation_means': row_methylation_means}, index=cat_subset.obs.index)
-             adata.obs.update(temp_obs_data)
-     # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
-     pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
-     adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
-
- def filter_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025):
-     """
-     Input: Adata object. Minimum thresholds for valid SMF site fraction in read, as well as minimum methylation content in read
-     Output: A subset of the adata object
-     """
-     if valid_SMF_site_threshold:
-         # Keep reads that have over a given valid GpC site content
-         adata = adata[adata.obs['fraction_valid_GpC_site_in_range'] > valid_SMF_site_threshold].copy()
-     if min_SMF_threshold:
-         # Keep reads with SMF methylation over background methylation.
-         adata = adata[adata.obs['GpC_above_other_C'] == True].copy()
-         # Keep reads over a defined methylation threshold
-         adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
-
- # PCR duplicate detection and complexity analysis.
- def binary_layers_to_ohe(adata, layers, stack='hstack'):
-     """
-     Input: An adata object and a list of layers containing a binary encoding.
-     Output: A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
-     """
-     # Extract the layers
-     layers = [adata.layers[layer_name] for layer_name in layers]
-     n_reads = layers[0].shape[0]
-     ohe_dict = {}
-     for i in range(n_reads):
-         read_ohe = []
-         for layer in layers:
-             read_ohe.append(layer[i])
-         read_name = adata.obs_names[i]
-         if stack == 'hstack':
-             ohe_dict[read_name] = np.hstack(read_ohe)
-         elif stack == 'vstack':
-             ohe_dict[read_name] = np.vstack(read_ohe)
-     return ohe_dict
-
- def calculate_pairwise_hamming_distances(arrays):
-     """
-     Calculate the pairwise Hamming distances for a list of ndarrays.
-     Input: A list of ndarrays
-     Output: a 2D array containing the pairwise Hamming distances.
-     """
-     num_arrays = len(arrays)
-     # Initialize an empty distance matrix
-     distance_matrix = np.zeros((num_arrays, num_arrays))
-     # Calculate pairwise distances with progress bar
-     for i in tqdm(range(num_arrays), desc="Calculating Hamming Distances"):
-         for j in range(i + 1, num_arrays):
-             distance = hamming(arrays[i], arrays[j])
-             distance_matrix[i, j] = distance
-             distance_matrix[j, i] = distance
-     return distance_matrix
-
- def min_non_diagonal(matrix):
-     """
-     Takes a matrix and returns the smallest value from each row with the diagonal masked
-     Input: A data matrix
-     Output: A list of minimum values from each row of the matrix
-     """
-     n = matrix.shape[0]
-     min_values = []
-     for i in range(n):
-         # Mask to exclude the diagonal element
-         row_mask = np.ones(n, dtype=bool)
-         row_mask[i] = False
-         # Extract the row excluding the diagonal element
-         row = matrix[i, row_mask]
-         # Find the minimum value in the row
-         min_values.append(np.min(row))
-     return min_values
-
- def lander_waterman(x, C0):
-     return C0 * (1 - np.exp(-x / C0))
-
- def count_unique_reads(reads, depth):
-     subsample = np.random.choice(reads, depth, replace=False)
-     return len(np.unique(subsample))
-
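For reference, the Lander-Waterman expression above, E[unique] = C0 * (1 - exp(-depth / C0)), saturates at the library complexity C0; a quick numeric check with an invented C0:

    import numpy as np

    def lander_waterman(x, C0):
        return C0 * (1 - np.exp(-x / C0))

    for depth in (500, 1000, 5000):
        print(depth, round(lander_waterman(depth, C0=1000), 1))
    # 500  393.5  -> few duplicate draws at low depth
    # 1000 632.1
    # 5000 993.3  -> approaching the C0 = 1000 plateau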
- def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names'):
-     """
-     Input: adata object, list of binary layers, column names to use.
-     Output: Marks duplicates in the adata object
-     """
-     categories = adata.obs[obs_column].cat.categories
-     sample_names = adata.obs[sample_col].cat.categories
-
-     # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
-     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-     for cat in categories:
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         for sample in sample_names:
-             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
-             # Encode sequencing reads as a one-hot-encodings
-             adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
-             # Unpack the read names and one hot encodings into lists
-             read_names = []
-             ohe_list = []
-             for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
-                 read_names.append(read_name)
-                 ohe_list.append(ohe)
-             # Calculate the pairwise hamming distances
-             print(f'Calculating hamming distances for {sample} on {cat} allele')
-             distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
-             n_reads = distance_matrix.shape[0]
-             # Load the hamming matrix into a dataframe with index and column names as the read_names
-             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
-             # Save the distance dataframe into an unstructured component of the adata object
-             adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
-             # Calculate the minimum non-self distance for every read in the reference and sample
-             min_distance_values = min_non_diagonal(distance_matrix)
-             min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
-             adata.obs.update(min_distance_df)
-             # Generate a histogram of minimum non-self distances for each read
-             min_distance_bins = plt.hist(min_distance_values, bins=n_reads//4)
-             # Normalize the max value in any histogram bin to 1
-             normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
-             # Extract the bin index of peak centers in the histogram
-             peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
-             first_peak_index = peak_centers[0]
-             offset_index = first_peak_index-1
-             # Use the distance corresponding to the first peak as the threshold distance in graph construction
-             first_peak_distance = min_distance_bins[1][first_peak_index]
-             offset_distance = min_distance_bins[1][offset_index]
-             adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
-
-     ## Detect likely duplicate reads and mark them in the adata object.
-     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
-
-     for cat in categories:
-         for sample in sample_names:
-             distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
-             read_names = distance_df.index
-             distance_matrix = distance_df.values
-             n_reads = distance_matrix.shape[0]
-             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
-             # Initialize the read distance graph
-             G = nx.Graph()
-             # Add each read as a node to the graph
-             G.add_nodes_from(range(n_reads))
-             # Add edges based on the threshold
-             for i in range(n_reads):
-                 for j in range(i + 1, n_reads):
-                     if distance_matrix[i, j] <= distance_threshold:
-                         G.add_edge(i, j)
-             # Determine distinct clusters using connected components
-             clusters = list(nx.connected_components(G))
-             clusters = [list(cluster) for cluster in clusters]
-             # Get the number of clusters
-             cluster_count = len(clusters)
-             adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, cluster_count / n_reads, clusters]
-             # Update the adata object
-             read_cluster_map = {}
-             read_duplicate_map = {}
-             read_keep_map = {}
-             for i, cluster in enumerate(clusters):
-                 for j, read_index in enumerate(cluster):
-                     read_name = read_names[read_index]
-                     read_cluster_map[read_name] = i
-                     if len(cluster) > 1:
-                         read_duplicate_map[read_name] = True
-                         if j == 0:
-                             read_keep_map[read_name] = True
-                         else:
-                             read_keep_map[read_name] = False
-                     elif len(cluster) == 1:
-                         read_duplicate_map[read_name] = False
-                         read_keep_map[read_name] = True
-             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
-             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
-             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
-             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
-             adata.obs.update(df_combined)
-             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
-             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
-             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {first_peak_distance}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {cluster_count / n_reads}')
-
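Stripped of the AnnData bookkeeping, the duplicate-marking logic above is: compute pairwise Hamming distances, connect reads whose distance falls at or below a threshold, and keep one read per connected component. A toy walk-through of that core step (reads, names, and threshold are invented for this sketch; note that scipy's hamming returns the fraction of mismatching positions):

    import numpy as np
    import networkx as nx
    from scipy.spatial.distance import hamming

    reads = {
        'read1': np.array([1, 0, 1, 1, 0, 0]),
        'read2': np.array([1, 0, 1, 1, 0, 1]),   # 1 of 6 positions differs from read1
        'read3': np.array([0, 1, 0, 0, 1, 1]),   # a distinct molecule
    }
    names = list(reads)
    pairs = [(a, b) for i, a in enumerate(names) for b in names[i + 1:]]
    dist = {(a, b): hamming(reads[a], reads[b]) for a, b in pairs}
    # {('read1','read2'): 0.167, ('read1','read3'): 1.0, ('read2','read3'): 0.833}

    threshold = 0.2
    G = nx.Graph()
    G.add_nodes_from(names)
    G.add_edges_from(pair for pair, d in dist.items() if d <= threshold)
    clusters = sorted(sorted(c) for c in nx.connected_components(G))
    print(clusters)   # [['read1', 'read2'], ['read3']] -> read2 is marked as a duplicate of read1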
- def plot_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
-     """
-     Input: adata object with mark_duplicates already run.
-     Output: A complexity analysis of the library
-     """
-     categories = adata.obs[obs_column].cat.categories
-     sample_names = adata.obs[sample_col].cat.categories
-
-     for cat in categories:
-         for sample in sample_names:
-             unique_reads, total_reads = adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'][0:2]
-             reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
-             # Subsampling depths
-             subsampling_depths = [total_reads // (i+1) for i in range(10)]
-             # Arrays to store results
-             subsampled_total_reads = []
-             subsampled_unique_reads = []
-             # Perform subsampling
-             for depth in subsampling_depths:
-                 unique_count = count_unique_reads(reads, depth)
-                 subsampled_total_reads.append(depth)
-                 subsampled_unique_reads.append(unique_count)
-             # Fit the Lander-Waterman model to the data
-             popt, _ = curve_fit(lander_waterman, subsampled_total_reads, subsampled_unique_reads)
-             # Generate data for the complexity curve
-             x_data = np.linspace(0, 5000, 100)
-             y_data = lander_waterman(x_data, *popt)
-             adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
-             if plot:
-                 # Plot the complexity curve
-                 plt.figure(figsize=(6, 4))
-                 plt.plot(total_reads, unique_reads, 'o', label='Observed unique reads')
-                 plt.plot(x_data, y_data, '-', label=f'Lander-Waterman fit\nEstimated C0 = {popt[0]:.2f}')
-                 plt.xlabel('Total number of reads')
-                 plt.ylabel('Number of unique reads')
-                 title = f'Library Complexity Analysis for {sample} on {cat}'
-                 plt.title(title)
-                 plt.legend()
-                 plt.grid(True)
-                 if save_plot:
-                     date_string = date_string()
-                     save_name = output_directory + f'/{date_string} {title}'
-                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
-                     plt.close()
-                 else:
-                     plt.show()
-
- def remove_duplicates(adata):
-     """
-     Input: adata object with marked duplicates
-     Output: Remove duplicates from the adata object
-     """
-     initial_size = adata.shape[0]
-     adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
-     final_size = adata.shape[0]
-     print(f'Removed {initial_size-final_size} reads from the dataset')
- ######################################################################################################
-
- ######################################################################################################
- ## Direct methylation SMF Specific
- ##############################################
- ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
- def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False):
-     """
-     Input: An adata object, a plus MTase control, a minus MTase control, the minimal J-statistic threshold, and a categorical observation column to iterate over.
-     Input notes: The control samples are passed as string names of the samples as they appear in the 'Sample_names' obs column
-     Output: Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
-     Can optionally save the output plots of the ROC curve
-     """
-     control_samples = [positive_control_sample, negative_control_sample]
-     categories = adata.obs[obs_column].cat.categories
-     # Iterate over each category in the specified obs_column
-     for cat in categories:
-         # Subset to keep only reads associated with the category
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         # Iterate over positive and negative control samples
-         for control in control_samples:
-             # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
-             adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
-             # get the current control subset on the given category
-             filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
-             control_subset = cat_subset[filtered_obs.index].copy()
-             # Iterate through every position in the control subset
-             for position in range(control_subset.shape[1]):
-                 # Get the coordinate name associated with that position
-                 coordinate = control_subset.var_names[position]
-                 # Get the array of methlyation probabilities for each read in the subset at that position
-                 position_data = control_subset.X[:, position]
-                 # Get the indexes of everywhere that is not a nan value
-                 nan_mask = ~np.isnan(position_data)
-                 # Keep only the methlyation data that has real values
-                 position_data = position_data[nan_mask]
-                 # Get the position data coverage
-                 position_coverage = len(position_data)
-                 # Get fraction coverage
-                 fraction_coverage = position_coverage / control_subset.shape[0]
-                 # Save the position and the position methylation data for the control subset
-                 adata.uns[f'{cat}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
-
-     for cat in categories:
-         fig, ax = plt.subplots(figsize=(6, 4))
-         plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
-         plt.xlabel('False Positive Rate')
-         plt.ylabel('True Positive Rate')
-         ax.spines['right'].set_visible(False)
-         ax.spines['top'].set_visible(False)
-         n_passed_positions = 0
-         n_total_positions = 0
-         # Initialize a list that will hold the positional thresholds for the category
-         probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
-         for i, key in enumerate(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'].keys()):
-             position = int(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][0])
-             positive_position_array = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][1]
-             fraction_coverage = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][2]
-             if fraction_coverage > 0.2:
-                 try:
-                     negative_position_array = adata.uns[f'{cat}_position_methylation_dict_{negative_control_sample}'][key][1]
-                     # Combine the negative and positive control data
-                     data = np.concatenate([negative_position_array, positive_position_array])
-                     labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))
-                     # Calculate the ROC curve
-                     fpr, tpr, thresholds = roc_curve(labels, data)
-                     # Calculate Youden's J statistic
-                     J = tpr - fpr
-                     optimal_idx = np.argmax(J)
-                     optimal_threshold = thresholds[optimal_idx]
-                     max_J = np.max(J)
-                     data_tuple = (optimal_threshold, max_J)
-                     probability_thresholding_list[position] = data_tuple
-                     n_total_positions += 1
-                     if max_J > J_threshold:
-                         n_passed_positions += 1
-                         plt.plot(fpr, tpr, label='ROC curve')
-                 except:
-                     probability_thresholding_list[position] = (0.8, np.nan)
-         title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
-         plt.title(title)
-         date_string = date_string()
-         save_name = output_directory + f'/{date_string} {title}'
-         if save:
-             plt.savefig(save_name)
-             plt.close()
-         else:
-             plt.show()
-         adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
-         J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
-         adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
-
- def binarize_on_Youden(adata, obs_column='Reference'):
-     """
-     Input: adata object that has had calculate_position_Youden called on it.
-     Output: Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
-     """
-     temp_adata = None
-     categories = adata.obs[obs_column].cat.categories
-     for cat in categories:
-         # Get the category subset
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         # extract the probability matrix for the category subset
-         original_matrix = cat_subset.X
-         # extract the learned methylation call thresholds for each position in the category.
-         thresholds = [cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'][i][0] for i in range(cat_subset.shape[1])]
-         # In the original matrix, get all positions that are nan values
-         nan_mask = np.isnan(original_matrix)
-         # Binarize the matrix on the new thresholds
-         binarized_matrix = (original_matrix > thresholds).astype(float)
-         # At the original positions that had nan values, replace the values with nans again
-         binarized_matrix[nan_mask] = np.nan
-         # Make a new layer for the reference that contains the binarized methylation calls
-         cat_subset.layers['binarized_methylation'] = binarized_matrix
-         if temp_adata:
-             # If temp_data already exists, concatenate
-             temp_adata = ad.concat([temp_adata, cat_subset], join='outer', index_unique=None).copy()
-         else:
-             # If temp_adata is still None, initialize temp_adata with reference_subset
-             temp_adata = cat_subset.copy()
-
-     # Sort the temp adata on the index names of the primary adata
-     temp_adata = temp_adata[adata.obs_names].copy()
-     # Pull back the new binarized layers into the original adata object
-     adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
-
- ######################################################################################################
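The per-position thresholding inside calculate_position_Youden reduces to a standard ROC / Youden's J computation; here is that core step in isolation on invented control probabilities:

    import numpy as np
    from sklearn.metrics import roc_curve

    # Methylation probabilities at one position from the two controls (toy values)
    negative = np.array([0.05, 0.10, 0.20, 0.30])   # minus-MTase control
    positive = np.array([0.60, 0.70, 0.80, 0.95])   # plus-MTase control

    data = np.concatenate([negative, positive])
    labels = np.array([0] * len(negative) + [1] * len(positive))
    fpr, tpr, thresholds = roc_curve(labels, data)

    J = tpr - fpr                      # Youden's J at each candidate threshold
    best = np.argmax(J)
    print(thresholds[best], J[best])   # 0.6 1.0 -> this toy position separates the controls perfectly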
smftools/preprocessing/binarize_on_Youden.py
@@ -1,42 +0,0 @@
- ## binarize_on_Youden
-
- def binarize_on_Youden(adata, obs_column='Reference'):
-     """
-     Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
-
-     Parameters:
-         adata (AnnData): The anndata object to binarize. pp.calculate_position_Youden function has to be run first.
-         obs_column (str): The obs_column to stratify on. Needs to be the same as passed in pp.calculate_position_Youden.
-     Input: adata object that has had calculate_position_Youden called on it.
-     Output:
-     """
-     import numpy as np
-     import anndata as ad
-     temp_adata = None
-     categories = adata.obs[obs_column].cat.categories
-     for cat in categories:
-         # Get the category subset
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         # extract the probability matrix for the category subset
-         original_matrix = cat_subset.X
-         # extract the learned methylation call thresholds for each position in the category.
-         thresholds = [cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'][i][0] for i in range(cat_subset.shape[1])]
-         # In the original matrix, get all positions that are nan values
-         nan_mask = np.isnan(original_matrix)
-         # Binarize the matrix on the new thresholds
-         binarized_matrix = (original_matrix > thresholds).astype(float)
-         # At the original positions that had nan values, replace the values with nans again
-         binarized_matrix[nan_mask] = np.nan
-         # Make a new layer for the reference that contains the binarized methylation calls
-         cat_subset.layers['binarized_methylation'] = binarized_matrix
-         if temp_adata:
-             # If temp_data already exists, concatenate
-             temp_adata = ad.concat([temp_adata, cat_subset], join='outer', index_unique=None).copy()
-         else:
-             # If temp_adata is still None, initialize temp_adata with reference_subset
-             temp_adata = cat_subset.copy()
-
-     # Sort the temp adata on the index names of the primary adata
-     temp_adata = temp_adata[adata.obs_names].copy()
-     # Pull back the new binarized layers into the original adata object
-     adata.layers['binarized_methylation'] = temp_adata.layers['binarized_methylation']
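For orientation, the call order implied by the two removed modules for the direct-methylation workflow, using only the signatures shown above (the AnnData variable, sample names, and threshold value are illustrative):

    # adata: AnnData of per-read methylation probabilities (reads x positions)
    calculate_position_Youden(adata,
                              positive_control_sample='plus_MTase',
                              negative_control_sample='minus_MTase',
                              J_threshold=0.4,
                              obs_column='Reference')
    binarize_on_Youden(adata, obs_column='Reference')
    # adata.layers['binarized_methylation'] now holds the thresholded 0/1 calls (NaN preserved).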