smftools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. smftools/__init__.py +27 -0
  2. smftools/_settings.py +19 -0
  3. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  4. smftools/datasets/__init__.py +9 -0
  5. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  6. smftools/datasets/datasets.py +25 -0
  7. smftools/informatics/__init__.py +11 -0
  8. smftools/informatics/helpers/__init__.py +42 -0
  9. smftools/informatics/helpers/align_BAM.py +49 -0
  10. smftools/informatics/helpers/binarize_converted_base_identities.py +24 -0
  11. smftools/informatics/helpers/canoncall.py +12 -0
  12. smftools/informatics/helpers/converted_BAM_to_adata.py +147 -0
  13. smftools/informatics/helpers/count_aligned_reads.py +32 -0
  14. smftools/informatics/helpers/extract_base_identities.py +36 -0
  15. smftools/informatics/helpers/extract_mods.py +39 -0
  16. smftools/informatics/helpers/find_conversion_sites.py +53 -0
  17. smftools/informatics/helpers/generate_converted_FASTA.py +59 -0
  18. smftools/informatics/helpers/get_native_references.py +25 -0
  19. smftools/informatics/helpers/informatics.py +260 -0
  20. smftools/informatics/helpers/load_adata.py +516 -0
  21. smftools/informatics/helpers/load_experiment_config.py +17 -0
  22. smftools/informatics/helpers/make_dirs.py +15 -0
  23. smftools/informatics/helpers/make_modbed.py +21 -0
  24. smftools/informatics/helpers/modQC.py +19 -0
  25. smftools/informatics/helpers/modcall.py +14 -0
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +355 -0
  27. smftools/informatics/helpers/one_hot_encode.py +14 -0
  28. smftools/informatics/helpers/separate_bam_by_bc.py +28 -0
  29. smftools/informatics/helpers/split_and_index_BAM.py +21 -0
  30. smftools/informatics/pod5_conversion.py +26 -0
  31. smftools/informatics/pod5_direct.py +29 -0
  32. smftools/informatics/pod5_to_adata.py +17 -0
  33. smftools/informatics/readwrite.py +109 -0
  34. smftools/plotting/__init__.py +0 -0
  35. smftools/preprocessing/__init__.py +35 -0
  36. smftools/preprocessing/append_C_context.py +39 -0
  37. smftools/preprocessing/binarize_on_Youden.py +38 -0
  38. smftools/preprocessing/binary_layers_to_ohe.py +25 -0
  39. smftools/preprocessing/calculate_complexity.py +59 -0
  40. smftools/preprocessing/calculate_converted_read_methylation_stats.py +38 -0
  41. smftools/preprocessing/calculate_coverage.py +35 -0
  42. smftools/preprocessing/calculate_pairwise_hamming_distances.py +22 -0
  43. smftools/preprocessing/calculate_position_Youden.py +95 -0
  44. smftools/preprocessing/calculate_read_length_stats.py +27 -0
  45. smftools/preprocessing/clean_NaN.py +31 -0
  46. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -0
  47. smftools/preprocessing/filter_reads_on_length.py +31 -0
  48. smftools/preprocessing/invert_adata.py +18 -0
  49. smftools/preprocessing/mark_duplicates.py +110 -0
  50. smftools/preprocessing/min_non_diagonal.py +20 -0
  51. smftools/preprocessing/preprocessing.py +614 -0
  52. smftools/preprocessing/remove_duplicates.py +12 -0
  53. smftools/readwrite.py +109 -0
  54. smftools/tools/__init__.py +0 -0
  55. smftools-0.1.0.dist-info/METADATA +75 -0
  56. smftools-0.1.0.dist-info/RECORD +58 -0
  57. smftools-0.1.0.dist-info/WHEEL +4 -0
  58. smftools-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,355 @@
+ ## modkit_extract_to_adata
+ from .. import readwrite
+ from .get_native_references import get_native_references
+ from .count_aligned_reads import count_aligned_reads
+ from .extract_base_identities import extract_base_identities
+ from .one_hot_encode import one_hot_encode
+ import pandas as pd
+ import anndata as ad
+ import os
+ import gc
+ import math
+ import numpy as np
+
+ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
+     """
+     Load modkit extract TSVs into per-read, per-position methylation arrays and write them out
+     as gzipped AnnData (.h5ad) objects in batches, then concatenate the batches into a single
+     experiment-level object.
+
+     Parameters:
+         fasta: Path to the reference FASTA used for alignment.
+         bam: Path to the aligned, sorted BAM file.
+         mapping_threshold: Minimum fraction of mapped reads a reference record needs to be analyzed.
+         experiment_name: Name used in the final output file name.
+         mods: List of modification types to extract ('6mA' and/or '5mC').
+         batch_size: Number of sample TSVs to process per batch.
+     """
+     ###################################################
+     ### Get input tsv file names into a sorted list ###
+     # List all files in the directory
+     files = os.listdir(os.getcwd())
+     # get current working directory
+     cwd = os.getcwd()
+     # Filter file names that contain the search string in their filename and keep them in a list
+     tsvs = [tsv for tsv in files if 'extract.tsv' in tsv]
+     # Sort file list by names and print the list of file names
+     tsvs.sort()
+     print(f'{len(tsvs)} sample tsv files found: {tsvs}')
+     print(f'sample bam file found: {bam}')
+
+     # Get all references within the FASTA and indicate the length and identity of the record sequence
+     max_reference_length = 0
+     reference_dict = get_native_references(fasta)
+     for record in reference_dict.keys():
+         if reference_dict[record][0] > max_reference_length:
+             max_reference_length = reference_dict[record][0]
+
+     print(f'{readwrite.time_string()}: Max reference length in dataset: {max_reference_length}')
+     batches = math.ceil(len(tsvs) / batch_size) # Number of batches to process
+     print('{0}: Processing input tsvs in {1} batches of {2} tsvs'.format(readwrite.time_string(), batches, batch_size))
+
+     # look at aligned read proportions in the bam
+     aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
+     print('{} percent of reads in bam aligned successfully'.format(aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)))
+     records_to_analyze = []
+     # Iterate over references and decide which to use in the analysis based on the mapping_threshold
+     for record in record_counts:
+         print('{0} reads mapped to reference record {1}. This is {2} percent of all mapped reads'.format(record_counts[record][0], record, record_counts[record][1]*100))
+         if record_counts[record][1] >= mapping_threshold:
+             records_to_analyze.append(record)
+     print(f'Records to analyze: {records_to_analyze}')
+     # Iterate over records to analyze and return a dictionary keyed by the reference name that points to another dictionary keyed by read names that map to that reference. This internal dictionary points to a one-hot encoding of the mapped read
+     record_seq_dict = {}
+     for record in records_to_analyze:
+         current_reference_length = reference_dict[record][0]
+         delta_max_length = max_reference_length - current_reference_length
+         sequence = reference_dict[record][1] + 'N'*delta_max_length
+         # Get a dictionary of positional base identities keyed by read id
+         base_identities = extract_base_identities(bam, record, current_reference_length, max_reference_length)
+         # One hot encode the sequence string of the reads
+         one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
+         record_seq_dict[record] = (one_hot_reads, sequence)
+
+     ###################################################
+
+     ###################################################
+     # Begin iterating over batches
+     for batch in range(batches):
+         print('{0}: Processing tsvs for batch {1}'.format(readwrite.time_string(), batch))
+         # For the final batch, just take the remaining tsv files
+         if batch == batches - 1:
+             tsv_batch = tsvs
+         # For all other batches, take the next batch of tsvs out of the file queue.
+         else:
+             tsv_batch = tsvs[:batch_size]
+             tsvs = tsvs[batch_size:]
+         print('{0}: tsvs in batch {1}'.format(readwrite.time_string(), tsv_batch))
+         ###################################################
+
+         ###################################################
+         ### Add the tsvs as dataframes to a dictionary (dict_total) keyed by integer index. Also make modification specific dictionaries and strand specific dictionaries.
+         # Initialize dictionaries and place them in a list
+         dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top = {}, {}, {}, {}, {}, {}, {}, {}, {}
+         dict_list = [dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top]
+
+         # Give names to represent each dictionary in the list
+         sample_types = ['total', 'm6A', 'm6A_bottom_strand', 'm6A_top_strand', '5mC', '5mC_bottom_strand', '5mC_top_strand', 'combined_bottom_strand', 'combined_top_strand']
+
+         # Give indices of dictionaries to skip for analysis and final dictionary saving.
+         dict_to_skip = [0, 1, 4]
+         combined_dicts = [7, 8]
+         A_stranded_dicts = [2, 3]
+         C_stranded_dicts = [5, 6]
+         dict_to_skip = dict_to_skip + combined_dicts + A_stranded_dicts + C_stranded_dicts
+         dict_to_skip = set(dict_to_skip)
+
+         # Load the dict_total dictionary with all of the tsv files as dataframes.
+         for i, tsv in enumerate(tsv_batch):
+             print('{0}: Loading sample tsv {1} into dataframe'.format(readwrite.time_string(), tsv))
+             temp_df = pd.read_csv(tsv, sep='\t', header=0)
+             for record in records_to_analyze:
+                 if record not in dict_total.keys():
+                     dict_total[record] = {}
+                 # Only keep the reads aligned to the chromosomes of interest
+                 print('{0}: Filtering sample dataframe to keep chromosome of interest'.format(readwrite.time_string()))
+                 dict_total[record][i] = temp_df[temp_df['chrom'] == record]
+                 # Only keep the read positions that fall within the region of interest
+                 print('{0}: Filtering sample dataframe to keep positions falling within region of interest'.format(readwrite.time_string()))
+                 current_reference_length = reference_dict[record][0]
+                 dict_total[record][i] = dict_total[record][i][(current_reference_length > dict_total[record][i]['ref_position']) & (dict_total[record][i]['ref_position'] >= 0)]
+
+         # Iterate over dict_total of all the tsv files and extract the modification specific and strand specific dataframes into dictionaries
+         for record in dict_total.keys():
+             for i in dict_total[record].keys():
+                 if '6mA' in mods:
+                     # Remove Adenine stranded dicts from the dicts to skip set
+                     dict_to_skip.difference_update(A_stranded_dicts)
+
+                     if record not in dict_a.keys() and record not in dict_a_bottom.keys() and record not in dict_a_top.keys():
+                         dict_a[record], dict_a_bottom[record], dict_a_top[record] = {}, {}, {}
+
+                     # get a dictionary of dataframes that only contain methylated adenine positions
+                     dict_a[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'A']
+                     print('{}: Successfully created a methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
+                     # Stratify the adenine dictionary into two strand specific dictionaries.
+                     dict_a_bottom[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '-']
+                     print('{}: Successfully created a minus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
+                     dict_a_top[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '+']
+                     print('{}: Successfully created a plus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
+
+                 if '5mC' in mods:
+                     # Remove Cytosine stranded dicts from the dicts to skip set
+                     dict_to_skip.difference_update(C_stranded_dicts)
+
+                     if record not in dict_c.keys() and record not in dict_c_bottom.keys() and record not in dict_c_top.keys():
+                         dict_c[record], dict_c_bottom[record], dict_c_top[record] = {}, {}, {}
+
+                     # get a dictionary of dataframes that only contain methylated cytosine positions
+                     dict_c[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'C']
+                     print('{}: Successfully created a methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
+                     # Stratify the cytosine dictionary into two strand specific dictionaries.
+                     dict_c_bottom[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '-']
+                     print('{}: Successfully created a minus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
+                     dict_c_top[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '+']
+                     print('{}: Successfully created a plus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
+                     # In the strand specific dictionaries, only keep positions that are informative for GpC SMF
+
+                 if '6mA' in mods and '5mC' in mods:
+                     # Remove combined stranded dicts from the dicts to skip set
+                     dict_to_skip.difference_update(combined_dicts)
+                     # Initialize the sample keys for the combined dictionaries
+
+                     if record not in dict_combined_bottom.keys() and record not in dict_combined_top.keys():
+                         dict_combined_bottom[record], dict_combined_top[record] = {}, {}
+
+                     print('{}: Successfully created a minus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(i))
+                     dict_combined_bottom[record][i] = []
+                     print('{}: Successfully created a plus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(i))
+                     dict_combined_top[record][i] = []
+
+         # Iterate over the stranded modification dictionaries and replace the dataframes with a dictionary of read names pointing to a list of values from the dataframe
+         for i, dict_type in enumerate(dict_list):
+             # Only iterate over stranded dictionaries
+             if i not in dict_to_skip:
+                 print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), sample_types[i]))
+                 for record in dict_type.keys():
+                     # Get the dictionary for the modification type of interest from the reference mapping of interest
+                     dict = dict_type[record]
+                     print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), record))
+                     # For each sample in a stranded dictionary
+                     for sample in dict.keys():
+                         print('{0}: Extracting {1} dictionary from record {2} for sample {3}'.format(readwrite.time_string(), sample_types[i], record, sample))
+                         # Load the combined bottom strand dictionary after all the individual dictionaries have been made for the sample
+                         if i == 7:
+                             # Load the minus strand dictionaries for each sample into temporary variables
+                             temp_a_dict = dict_list[2][record][sample].copy()
+                             temp_c_dict = dict_list[5][record][sample].copy()
+                             dict[sample] = {}
+                             # Iterate over the reads present in the merge of both dictionaries
+                             for read in set(temp_a_dict) | set(temp_c_dict):
+                                 # Add the arrays element-wise if the read is present in both dictionaries
+                                 if read in temp_a_dict and read in temp_c_dict:
+                                     dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
+                                 # If the read is present in only one dictionary, copy its value
+                                 elif read in temp_a_dict:
+                                     dict[sample][read] = temp_a_dict[read]
+                                 else:
+                                     dict[sample][read] = temp_c_dict[read]
+                         # Load the combined top strand dictionary after all the individual dictionaries have been made for the sample
+                         elif i == 8:
+                             # Load the plus strand dictionaries for each sample into temporary variables
+                             temp_a_dict = dict_list[3][record][sample].copy()
+                             temp_c_dict = dict_list[6][record][sample].copy()
+                             dict[sample] = {}
+                             # Iterate over the reads present in the merge of both dictionaries
+                             for read in set(temp_a_dict) | set(temp_c_dict):
+                                 # Add the arrays element-wise if the read is present in both dictionaries
+                                 if read in temp_a_dict and read in temp_c_dict:
+                                     dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
+                                 # If the read is present in only one dictionary, copy its value
+                                 elif read in temp_a_dict:
+                                     dict[sample][read] = temp_a_dict[read]
+                                 else:
+                                     dict[sample][read] = temp_c_dict[read]
+                         # For all other dictionaries
+                         else:
+                             # extract the dataframe from the dictionary into a temporary variable
+                             temp_df = dict[sample]
+                             # reassign the dictionary pointer to a nested dictionary.
+                             dict[sample] = {}
+                             # Iterate through rows in the temp DataFrame
+                             for index, row in temp_df.iterrows():
+                                 read = row['read_id'] # read name
+                                 position = row['ref_position'] # positional coordinate
+                                 probability = row['call_prob'] # Get the probability of the given call
+                                 # if the call_code is modified, change the methylated value to the probability of methylation
+                                 if row['call_code'] in ['a', 'h', 'm']:
+                                     methylated = probability
+                                 # If the call code is canonical, change the methylated value to 1 - the probability of canonical
+                                 elif row['call_code'] in ['-']:
+                                     methylated = 1 - probability
+                                 # Skip any other call codes so that methylated is never left undefined
+                                 else:
+                                     continue
+
+                                 # If the current read is not in the dictionary yet, initialize the dictionary with a nan filled numpy array of proper size.
+                                 if read not in dict[sample]:
+                                     dict[sample][read] = np.full(max_reference_length, np.nan)
+                                 else:
+                                     pass
+                                 # add the positional methylation state to the numpy array
+                                 dict[sample][read][position-1] = methylated
+
+         # Save the sample files in the batch as gzipped hdf5 files
+         print('{0}: Converting batch {1} dictionaries to anndata objects'.format(readwrite.time_string(), batch))
+         for i, dict_type in enumerate(dict_list):
+             if i not in dict_to_skip:
+                 # Initialize an hdf5 file for the current modified strand
+                 adata = None
+                 print('{0}: Converting {1} dictionary to an anndata object'.format(readwrite.time_string(), sample_types[i]))
+                 for record in dict_type.keys():
+                     # Get the dictionary for the modification type of interest from the reference mapping of interest
+                     dict = dict_type[record]
+                     for sample in dict.keys():
+                         print('{0}: Converting {1} dictionary for sample {2} to an anndata object'.format(readwrite.time_string(), sample_types[i], sample))
+                         sample = int(sample)
+                         final_sample_index = sample + (batch * batch_size)
+                         print('{0}: Final sample index for sample: {1}'.format(readwrite.time_string(), final_sample_index))
+                         print('{0}: Converting {1} dictionary for sample {2} to a dataframe'.format(readwrite.time_string(), sample_types[i], final_sample_index))
+                         temp_df = pd.DataFrame.from_dict(dict[sample], orient='index')
+                         sorted_index = sorted(temp_df.index)
+                         temp_df = temp_df.reindex(sorted_index)
+                         X = temp_df.values
+                         one_hot_encodings = record_seq_dict[record][0]
+                         read_names = list(one_hot_encodings.keys())
+                         sequence_length = one_hot_encodings[read_names[0]].shape[0]
+                         dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+                         # Loop through each read name and its corresponding one-hot array
+                         print('{0}: Extracting one hot encodings into dictionaries'.format(readwrite.time_string()))
+                         for read_name, one_hot_array in one_hot_encodings.items():
+                             dict_A[read_name] = one_hot_array[:, 0]
+                             dict_C[read_name] = one_hot_array[:, 1]
+                             dict_G[read_name] = one_hot_array[:, 2]
+                             dict_T[read_name] = one_hot_array[:, 3]
+                             dict_N[read_name] = one_hot_array[:, 4]
+                         # Load dfs with data from the dictionaries
+                         print('{0}: Loading dataframes from one hot encoded dictionaries'.format(readwrite.time_string()))
+                         df_A = pd.DataFrame.from_dict(dict_A, orient='index').reindex(sorted_index)
+                         df_C = pd.DataFrame.from_dict(dict_C, orient='index').reindex(sorted_index)
+                         df_G = pd.DataFrame.from_dict(dict_G, orient='index').reindex(sorted_index)
+                         df_T = pd.DataFrame.from_dict(dict_T, orient='index').reindex(sorted_index)
+                         df_N = pd.DataFrame.from_dict(dict_N, orient='index').reindex(sorted_index)
+
+                         ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+
+                         print('{0}: Loading {1} dataframe for sample {2} into a temp anndata object'.format(readwrite.time_string(), sample_types[i], final_sample_index))
+                         temp_adata = ad.AnnData(X, dtype=X.dtype)
+                         print('{0}: Adding read names and position ids to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
+                         temp_adata.obs_names = temp_df.index
+                         temp_adata.obs_names = temp_adata.obs_names.astype(str)
+                         temp_adata.var_names = temp_df.columns
+                         temp_adata.var_names = temp_adata.var_names.astype(str)
+                         print('{0}: Adding final sample id to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
+                         temp_adata.obs['Sample'] = [str(final_sample_index)] * len(temp_adata)
+                         dataset, strand = sample_types[i].split('_')[:2]
+                         temp_adata.obs['Strand'] = [strand] * len(temp_adata)
+                         temp_adata.obs['Dataset'] = [dataset] * len(temp_adata)
+                         temp_adata.obs['Reference'] = [f'{record}_{dataset}_{strand}'] * len(temp_adata)
+                         temp_adata.obs['Reference_chromosome'] = [f'{record}'] * len(temp_adata)
+
+                         for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                             temp_adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+
+                         # If the final adata object already has a sample loaded, concatenate the current sample into the existing adata object
+                         if adata:
+                             print('{0}: Concatenating {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
+                             adata = ad.concat([adata, temp_adata], join='outer', index_unique=None)
+                         else:
+                             print('{0}: Initializing {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
+                             adata = temp_adata
+
+                 print('{0}: Writing {1} anndata out as a gzipped hdf5 file'.format(readwrite.time_string(), sample_types[i]))
+                 adata.write_h5ad('{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(readwrite.date_string(), batch, sample_types[i]), compression='gzip')
+
+         # Delete the batch dictionaries from memory
+         del dict_list
+         gc.collect()
+
+     # Iterate over all of the batched hdf5 files and concatenate them.
+     files = os.listdir(os.getcwd())
+     # Name the final output file
+     final_hdf = '{0}_{1}_final_experiment_hdf5.h5ad.gz'.format(readwrite.date_string(), experiment_name)
+     # Filter file names that contain the search string in their filename and keep them in a list
+     hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
+     # Sort file list by names and print the list of file names
+     hdfs.sort()
+     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
+     final_adata = None
+     for hdf in hdfs:
+         print('{0}: Reading in {1} hdf5 file'.format(readwrite.time_string(), hdf))
+         temp_adata = ad.read_h5ad(hdf)
+         if final_adata:
+             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdf))
+             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
+         else:
+             print('{0}: Initializing final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdf))
+             final_adata = temp_adata
+     print('{0}: Writing final concatenated hdf5 file'.format(readwrite.time_string()))
+
+     for record in records_to_analyze:
+         # Add FASTA sequence to the object
+         sequence = record_seq_dict[record][1]
+         final_adata.uns[f'{record}_FASTA_sequence'] = sequence
+         final_adata.var[f'{record}_FASTA_sequence_base'] = list(sequence)
+
+         # Add consensus sequence of samples mapped to the record to the object
+         record_subset = final_adata[final_adata.obs['Reference_chromosome'] == record].copy()
+         layer_map, layer_counts = {}, []
+         for i, layer in enumerate(record_subset.layers):
+             layer_map[i] = layer.split('_')[0]
+             layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
+         count_array = np.array(layer_counts)
+         nucleotide_indexes = np.argmax(count_array, axis=0)
+         consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+         final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
+
+     final_adata.write_h5ad(final_hdf, compression='gzip')
+
+     # Delete the individual h5ad files and only keep the final concatenated file
+     files = os.listdir(os.getcwd())
+     hdfs_to_delete = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
+     # Iterate over the files and delete them
+     for hdf in hdfs_to_delete:
+         try:
+             os.remove(hdf)
+             print(f"Deleted file: {hdf}")
+         except OSError as e:
+             print(f"Error deleting file {hdf}: {e}")
@@ -0,0 +1,14 @@
+ # one_hot_encode
+ from .. import readwrite
+ import numpy as np
+
+ # String encodings
+ def one_hot_encode(sequence):
+     """
+     Input: A sequence string of a read.
+     Output: One hot encoding of the sequence string.
+     """
+     mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
+     one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
+     for i, nucleotide in enumerate(sequence):
+         one_hot_matrix[i, mapping[nucleotide]] = 1
+     return one_hot_matrix
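
A quick sanity check of the encoding (expected output shown in comments):

    from smftools.informatics.helpers.one_hot_encode import one_hot_encode

    matrix = one_hot_encode("ACGTN")
    print(matrix.shape)  # (5, 5): one row per base, one column per symbol in A/C/G/T/N order
    print(matrix[0])     # [1 0 0 0 0] -> 'A' sets column 0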
@@ -0,0 +1,28 @@
+ ## separate_bam_by_bc
+ import pysam
+
+ # General
+ def separate_bam_by_bc(input_bam, output_prefix):
+     """
+     Input: Takes a single BAM input, plus an output prefix to use for the output files.
+     Output: Splits the BAM into one file per BC SAM tag value.
+     """
+     # Open the input BAM file for reading
+     with pysam.AlignmentFile(input_bam, "rb") as bam:
+         # Create a dictionary to store output BAM files
+         output_files = {}
+         # Iterate over each read in the BAM file
+         for read in bam:
+             try:
+                 # Get the barcode tag value
+                 bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
+                 # Open the output BAM file corresponding to the barcode
+                 if bc_tag not in output_files:
+                     output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}.bam", "wb", header=bam.header)
+                 # Write the read to the corresponding output BAM file
+                 output_files[bc_tag].write(read)
+             except KeyError:
+                 print(f"BC tag not present for read: {read.query_name}")
+         # Close all output BAM files
+         for output_file in output_files.values():
+             output_file.close()
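
A usage sketch with hypothetical file names; one output BAM is written per distinct BC tag value, named '<prefix>_<barcode suffix>.bam'.

    from smftools.informatics.helpers.separate_bam_by_bc import separate_bam_by_bc

    # Hypothetical input: reads carry BC tags such as 'barcode01', 'barcode02', ...
    # so 'demux_01.bam', 'demux_02.bam', ... are produced in the working directory.
    separate_bam_by_bc("aligned_sorted.bam", "demux")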
@@ -0,0 +1,21 @@
+ ## split_and_index_BAM
+ from .. import readwrite
+ import os
+ import subprocess
+ import glob
+ from .separate_bam_by_bc import separate_bam_by_bc
+
+ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
+     """
+     A wrapper function for splitting BAMs by barcode and indexing them.
+     """
+     os.chdir(split_dir)
+     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
+     file_prefix = readwrite.date_string()
+     separate_bam_by_bc(aligned_sorted_output, file_prefix)
+     # Make a BAM index file for the BAMs in that directory
+     bam_pattern = '*' + bam_suffix
+     bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+     for input_file in bam_files:
+         subprocess.run(["samtools", "index", input_file])
+         print(f"Indexed {input_file}")
@@ -0,0 +1,26 @@
+ ## pod5_conversion
+ from .helpers import align_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
+ import subprocess
+
+ def pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix):
+     """
+     Converts a POD5 file from a nanopore conversion SMF experiment to an adata object.
+     """
+     bam = f"{output_directory}/HAC_basecalls"
+     aligned_BAM = f"{bam}_aligned"
+     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+     # 1) Convert the reference FASTA file
+     converted_FASTA = fasta.split('.fa')[0] + '_converted.fasta'
+     generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
+
+     # 2) Basecall from the input POD5 to generate a single output BAM
+     canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix)
+
+     # 3) Align the BAM to the converted reference FASTA and sort the BAM on positional coordinates. Also make an index and a bed file of mapped reads
+     align_BAM(converted_FASTA, bam, bam_suffix)
+
+     # 4) Split the aligned and sorted BAM files by barcode (BC tag) into the split_BAM directory
+     split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
+
+     # 5) Take the converted BAM and load it into an adata object.
+     converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
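
A call sketch for the conversion pipeline. Every argument value below is hypothetical (model name, barcode kit, paths, conversion and strand labels), and the external tools used by the underlying helpers (dorado for basecalling, samtools, an aligner) are assumed to be installed.

    from smftools.informatics.pod5_conversion import pod5_conversion

    pod5_conversion(
        fasta="NKG2A_locus.fasta",
        output_directory="/data/run1",
        conversion_types=["5mC"],                    # hypothetical conversion label
        strands=["top", "bottom"],                   # hypothetical strand labels
        model="dna_r10.4.1_e8.2_400bps_hac@v4.2.0",  # hypothetical dorado model
        pod5_dir="/data/run1/pod5",
        split_dir="/data/run1/split_BAMs",
        barcode_kit="SQK-NBD114-24",                 # hypothetical barcoding kit
        mapping_threshold=0.01,
        experiment_name="conversion_SMF_example",
        bam_suffix=".bam",
    )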
@@ -0,0 +1,29 @@
+ ## pod5_direct
+ from .helpers import align_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM
+
+ def pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size):
+     """
+     Converts POD5 files from a nanopore direct methylation detection SMF experiment to an adata object.
+     """
+     bam = f"{output_directory}/HAC_mod_calls"
+     aligned_BAM = f"{bam}_aligned"
+     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+     mod_bed_dir = f"{output_directory}/split_mod_beds"
+     mod_tsv_dir = f"{output_directory}/split_mod_tsvs"
+
+     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
+     mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
+     mods = [mod_map[mod] for mod in mod_list]
+
+     # 1) Basecall using dorado
+     modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix)
+     # 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
+     align_BAM(fasta, bam, bam_suffix)
+     # 3) Split the aligned and sorted BAM files by barcode (BC tag) into the split_BAM directory
+     split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
+     # 4) Use nanopore modkit to work with the modified BAM files
+     modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
+     make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample
+     extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix) # Extract methylation calls for split BAM files into split TSV files
+     # 5) Load the modification data from TSVs into an adata object
+     modkit_extract_to_adata(fasta, aligned_sorted_output, mapping_threshold, experiment_name, mods, batch_size)
@@ -0,0 +1,17 @@
+ ## pod5_to_adata
+ from .helpers import load_experiment_config
+ from .pod5_direct import pod5_direct
+ from .pod5_conversion import pod5_conversion
+
+ def pod5_to_adata(config_path):
+     """
+     Top-level entry point: loads an experiment configuration and dispatches to the
+     conversion or direct SMF pipeline.
+     """
+     # Load experiment config parameters into global variables
+     load_experiment_config(config_path)
+     if smf_modality == 'conversion':
+         pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
+     elif smf_modality == 'direct':
+         pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
+     else:
+         print("Error: smf_modality must be 'conversion' or 'direct'")
@@ -0,0 +1,109 @@
+ ## readwrite ##
+
+ # Basic I/O
+ import os
+ # Datetime
+ from datetime import datetime
+ # Data structures and basic operations
+ import math
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ import scipy.sparse as sp
+
+ # Runtime warnings
+ import warnings
+ warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
+ warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
+
+ ######################################################################################################
+ ## Datetime functionality
+ def date_string():
+     """
+     Each time this is called, it returns the current date string
+     """
+     current_date = datetime.now()
+     date_string = current_date.strftime("%Y%m%d")
+     date_string = date_string[2:]
+     return date_string
+
+ def time_string():
+     """
+     Each time this is called, it returns the current time string
+     """
+     current_time = datetime.now()
+     return current_time.strftime("%H:%M:%S")
+ ######################################################################################################
+
+ ######################################################################################################
+ ## Numpy, Pandas, Anndata functionality
+ def adata_to_df(adata, layer=None):
+     """
+     Input: An adata object with a specified layer.
+     Output: A dataframe for the specific layer.
+     """
+     # Extract the data matrix from the given layer
+     if layer:
+         data_matrix = adata.layers[layer]
+     else:
+         data_matrix = adata.X
+     # Extract observation (read) annotations
+     obs_df = adata.obs
+     # Extract variable (position) annotations
+     var_df = adata.var
+     # Convert data matrix and annotations to pandas DataFrames
+     df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
+     return df
+
+ def save_matrix(matrix, save_name):
+     """
+     Input: A numpy matrix and a save_name
+     Output: A txt file representation of the data matrix
+     """
+     np.savetxt(f'{save_name}.txt', matrix)
+
+ def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
+     """
+     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
+     Input: an output file path relative to the directory in which the function is called
+     """
+     # List all files in the directory
+     files = os.listdir(os.getcwd())
+     # get current working directory
+     cwd = os.getcwd()
+     suffix = file_suffix
+     # Filter file names that contain the search string in their filename and keep them in a list
+     hdfs = [hdf for hdf in files if suffix in hdf]
+     # Sort file list by names and print the list of file names
+     hdfs.sort()
+     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
+     # Iterate over all of the hdf5 files and concatenate them.
+     final_adata = None
+     for hdf in hdfs:
+         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
+         temp_adata = ad.read_h5ad(hdf)
+         if final_adata:
+             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
+             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
+         else:
+             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
+             final_adata = temp_adata
+     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
+     final_adata.write_h5ad(output_file, compression='gzip')
+
+     # Delete the individual h5ad files and only keep the final concatenated file
+     if delete_inputs:
+         files = os.listdir(os.getcwd())
+         hdfs = [hdf for hdf in files if suffix in hdf]
+         if output_file in hdfs:
+             hdfs.remove(output_file)
+         # Iterate over the files and delete them
+         for hdf in hdfs:
+             try:
+                 os.remove(hdf)
+                 print(f"Deleted file: {hdf}")
+             except OSError as e:
+                 print(f"Error deleting file {hdf}: {e}")
+     else:
+         print('Keeping input files')
+ ######################################################################################################
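
A short usage sketch for these helpers; the layer name and file names are hypothetical, and concatenate_h5ads assumes per-sample '*.h5ad.gz' files already sit in the current working directory.

    import anndata as ad
    import numpy as np
    from smftools.informatics import readwrite

    # Toy AnnData object with a hypothetical layer.
    adata = ad.AnnData(np.random.rand(4, 3))
    adata.layers["demo_layer"] = np.random.rand(4, 3)

    df = readwrite.adata_to_df(adata, layer="demo_layer")  # rows = reads, columns = positions
    readwrite.save_matrix(df.values, "demo_matrix")        # writes demo_matrix.txt

    # Merge every '*.h5ad.gz' file in the current directory into one object;
    # delete_inputs=False keeps the per-sample files (deletion is the default).
    readwrite.concatenate_h5ads("merged.h5ad.gz", file_suffix="h5ad.gz", delete_inputs=False)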
File without changes
@@ -0,0 +1,35 @@
+ from .append_C_context import append_C_context
+ from .binarize_on_Youden import binarize_on_Youden
+ from .binary_layers_to_ohe import binary_layers_to_ohe
+ from .calculate_complexity import calculate_complexity
+ from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
+ from .calculate_coverage import calculate_coverage
+ from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
+ from .calculate_position_Youden import calculate_position_Youden
+ from .calculate_read_length_stats import calculate_read_length_stats
+ from .clean_NaN import clean_NaN
+ from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
+ from .filter_reads_on_length import filter_reads_on_length
+ from .invert_adata import invert_adata
+ from .mark_duplicates import mark_duplicates
+ from .min_non_diagonal import min_non_diagonal
+ from .remove_duplicates import remove_duplicates
+
+ __all__ = [
+     "append_C_context",
+     "binarize_on_Youden",
+     "binary_layers_to_ohe",
+     "calculate_complexity",
+     "calculate_converted_read_methylation_stats",
+     "calculate_coverage",
+     "calculate_pairwise_hamming_distances",
+     "calculate_position_Youden",
+     "calculate_read_length_stats",
+     "clean_NaN",
+     "filter_converted_reads_on_methylation",
+     "filter_reads_on_length",
+     "invert_adata",
+     "mark_duplicates",
+     "min_non_diagonal",
+     "remove_duplicates"
+ ]