smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools-0.1.6.dist-info/METADATA +127 -0
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -19
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/__init__.py +0 -9
  8. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  9. smftools/datasets/datasets.py +0 -27
  10. smftools/informatics/__init__.py +0 -12
  11. smftools/informatics/bam_conversion.py +0 -47
  12. smftools/informatics/bam_direct.py +0 -49
  13. smftools/informatics/basecalls_to_adata.py +0 -42
  14. smftools/informatics/fast5_to_pod5.py +0 -19
  15. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  16. smftools/informatics/helpers/__init__.py +0 -42
  17. smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
  18. smftools/informatics/helpers/archived/informatics.py +0 -260
  19. smftools/informatics/helpers/archived/load_adata.py +0 -516
  20. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  21. smftools/informatics/helpers/canoncall.py +0 -23
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
  23. smftools/informatics/helpers/count_aligned_reads.py +0 -39
  24. smftools/informatics/helpers/extract_base_identities.py +0 -43
  25. smftools/informatics/helpers/extract_mods.py +0 -51
  26. smftools/informatics/helpers/find_conversion_sites.py +0 -59
  27. smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
  28. smftools/informatics/helpers/get_native_references.py +0 -28
  29. smftools/informatics/helpers/make_dirs.py +0 -21
  30. smftools/informatics/helpers/make_modbed.py +0 -27
  31. smftools/informatics/helpers/modQC.py +0 -27
  32. smftools/informatics/helpers/modcall.py +0 -26
  33. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
  34. smftools/informatics/helpers/one_hot_encode.py +0 -19
  35. smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
  36. smftools/informatics/helpers/split_and_index_BAM.py +0 -29
  37. smftools/informatics/pod5_conversion.py +0 -53
  38. smftools/informatics/pod5_direct.py +0 -55
  39. smftools/informatics/pod5_to_adata.py +0 -40
  40. smftools/informatics/readwrite.py +0 -106
  41. smftools/informatics/subsample_pod5.py +0 -48
  42. smftools/plotting/__init__.py +0 -0
  43. smftools/preprocessing/__init__.py +0 -29
  44. smftools/preprocessing/append_C_context.py +0 -46
  45. smftools/preprocessing/archives/preprocessing.py +0 -614
  46. smftools/preprocessing/binarize_on_Youden.py +0 -42
  47. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  48. smftools/preprocessing/calculate_complexity.py +0 -71
  49. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
  50. smftools/preprocessing/calculate_coverage.py +0 -41
  51. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  52. smftools/preprocessing/calculate_position_Youden.py +0 -104
  53. smftools/preprocessing/calculate_read_length_stats.py +0 -32
  54. smftools/preprocessing/clean_NaN.py +0 -38
  55. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
  56. smftools/preprocessing/filter_reads_on_length.py +0 -39
  57. smftools/preprocessing/invert_adata.py +0 -22
  58. smftools/preprocessing/mark_duplicates.py +0 -119
  59. smftools/preprocessing/min_non_diagonal.py +0 -25
  60. smftools/preprocessing/remove_duplicates.py +0 -18
  61. smftools/readwrite.py +0 -106
  62. smftools/tools/__init__.py +0 -0
  63. smftools-0.1.1.dist-info/METADATA +0 -88
  64. smftools-0.1.1.dist-info/RECORD +0 -64
  65. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  66. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,367 +0,0 @@
1
- ## modkit_extract_to_adata
2
-
3
- def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
4
- """
5
- Takes modkit extract TSV outputs and organizes them into an adata object
6
-
7
- Parameters:
8
- fasta (str): File path to the reference genome to align to.
9
- bam (str): File path to the aligned_sorted non-split modified BAM file
10
- mapping_threshold (float): A value between 0 and 1 giving the minimum fraction of aligned reads that must map to a reference region. References at or above the threshold are included in the output adata.
11
- experiment_name (str): A string to provide an experiment name to the output adata file.
12
- mods (list): A list of strings of the modification types to use in the analysis.
13
- batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
14
-
15
- Returns:
16
- None
17
- """
18
- from .. import readwrite
19
- from .get_native_references import get_native_references
20
- from .count_aligned_reads import count_aligned_reads
21
- from .extract_base_identities import extract_base_identities
22
- from .one_hot_encode import one_hot_encode
23
- import pandas as pd
24
- import anndata as ad
25
- import os
26
- import gc
27
- import math
28
- import numpy as np
29
- ###################################################
30
- ### Get input tsv file names into a sorted list ###
31
- # List all files in the directory
32
- files = os.listdir(os.getcwd())
33
- # get current working directory
34
- cwd = os.getcwd()
35
- # Filter file names that contain the search string in their filename and keep them in a list
36
- tsvs = [tsv for tsv in files if 'extract.tsv' in tsv]
37
- # Sort file list by names and print the list of file names
38
- tsvs.sort()
39
- print(f'{len(tsvs)} sample tsv files found: {tsvs}')
40
- print(f'sample bam file found: {bam}')
41
-
42
- # Get all references within the FASTA and indicate the length and identity of the record sequence
43
- max_reference_length = 0
44
- reference_dict = get_native_references(fasta)
45
- for record in reference_dict.keys():
46
- if reference_dict[record][0] > max_reference_length:
47
- max_reference_length = reference_dict[record][0]
48
-
49
- print(f'{readwrite.time_string()}: Max reference length in dataset: {max_reference_length}')
50
- batches = math.ceil(len(tsvs) / batch_size) # Number of batches to process
51
- print('{0}: Processing input tsvs in {1} batches of {2} tsvs '.format(readwrite.time_string(), batches, batch_size))
52
-
53
- # look at aligned read proportions in the bam
54
- aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
55
- print('{} percent of reads in bam aligned successfully'.format(aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)))
56
- records_to_analyze = []
57
- # Iterate over references and decide which to use in the analysis based on the mapping_threshold
58
- for record in record_counts:
59
- print('{0} reads mapped to reference record {1}. This is {2} percent of all mapped reads'.format(record_counts[record][0], record, record_counts[record][1]*100))
60
- if record_counts[record][1] >= mapping_threshold:
61
- records_to_analyze.append(record)
62
- print(f'Records to analyze: {records_to_analyze}')
63
- # Iterate over the records to analyze and build a dictionary keyed by reference name. Each entry pairs a per-read dictionary (read name -> one-hot encoding of the mapped read) with the padded reference sequence
64
- record_seq_dict = {}
65
- for record in records_to_analyze:
66
- current_reference_length = reference_dict[record][0]
67
- delta_max_length = max_reference_length - current_reference_length
68
- sequence = reference_dict[record][1] + 'N'*delta_max_length
69
- # Get a dictionary of positional base identities keyed by read id
70
- positions = range(current_reference_length)
71
- base_identities = extract_base_identities(bam, record, positions, max_reference_length)
72
- # One hot encode the sequence string of the reads
73
- one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
74
- record_seq_dict[record] = (one_hot_reads, sequence)
75
-
76
- ###################################################
77
-
78
- ###################################################
79
- # Begin iterating over batches
80
- for batch in range(batches):
81
- print('{0}: Processing tsvs for batch {1} '.format(readwrite.time_string(), batch))
82
- # For the final batch, just take the remaining tsv files
83
- if batch == batches - 1:
84
- tsv_batch = tsvs
85
- # For all other batches, take the next batch of tsvs out of the file queue.
86
- else:
87
- tsv_batch = tsvs[:batch_size]
88
- tsvs = tsvs[batch_size:]
89
- print('{0}: tsvs in batch {1} '.format(readwrite.time_string(), tsv_batch))
90
- ###################################################
91
-
92
- ###################################################
93
- ### Add the tsvs as dataframes to a dictionary (dict_total) keyed by integer index. Also make modification specific dictionaries and strand specific dictionaries.
94
- # Initialize dictionaries and place them in a list
95
- dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top = {},{},{},{},{},{},{},{},{}
96
- dict_list = [dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top]
97
-
98
- # Give names to represent each dictionary in the list
99
- sample_types = ['total', 'm6A', 'm6A_bottom_strand', 'm6A_top_strand', '5mC', '5mC_bottom_strand', '5mC_top_strand', 'combined_bottom_strand', 'combined_top_strand']
100
-
101
- # Give indices of dictionaries to skip for analysis and final dictionary saving.
102
- dict_to_skip = [0, 1, 4]
103
- combined_dicts = [7, 8]
104
- A_stranded_dicts = [2, 3]
105
- C_stranded_dicts = [5, 6]
106
- dict_to_skip = dict_to_skip + combined_dicts + A_stranded_dicts + C_stranded_dicts
107
- dict_to_skip = set(dict_to_skip)
108
-
109
- # Load the dict_total dictionary with all of the tsv files as dataframes.
110
- for i, tsv in enumerate(tsv_batch):
111
- print('{0}: Loading sample tsv {1} into dataframe'.format(readwrite.time_string(), tsv))
112
- temp_df = pd.read_csv(tsv, sep='\t', header=0)
113
- for record in records_to_analyze:
114
- if record not in dict_total.keys():
115
- dict_total[record] = {}
116
- # Only keep the reads aligned to the chromosomes of interest
117
- print('{0}: Filtering sample dataframe to keep chromosome of interest'.format(readwrite.time_string()))
118
- dict_total[record][i] = temp_df[temp_df['chrom'] == record]
119
- # Only keep the read positions that fall within the region of interest
120
- print('{0}: Filtering sample dataframe to keep positions falling within region of interest'.format(readwrite.time_string()))
121
- current_reference_length = reference_dict[record][0]
122
- dict_total[record][i] = dict_total[record][i][(current_reference_length > dict_total[record][i]['ref_position']) & (dict_total[record][i]['ref_position']>= 0)]
123
-
124
- # Iterate over dict_total of all the tsv files and extract the modification specific and strand specific dataframes into dictionaries
125
- for record in dict_total.keys():
126
- for i in dict_total[record].keys():
127
- if '6mA' in mods:
128
- # Remove Adenine stranded dicts from the dicts to skip set
129
- dict_to_skip.difference_update(A_stranded_dicts)
130
-
131
- if record not in dict_a.keys() and record not in dict_a_bottom.keys() and record not in dict_a_top.keys():
132
- dict_a[record], dict_a_bottom[record], dict_a_top[record] = {}, {}, {}
133
-
134
- # get a dictionary of dataframes that only contain methylated adenine positions
135
- dict_a[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'A']
136
- print('{}: Successfully created a methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
137
- # Stratify the adenine dictionary into two strand specific dictionaries.
138
- dict_a_bottom[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '-']
139
- print('{}: Successfully created a minus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
140
- dict_a_top[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '+']
141
- print('{}: Successfully created a plus strand methyl-adenine dictionary for '.format(readwrite.time_string()) + str(i))
142
-
143
- if '5mC' in mods:
144
- # Remove Cytosine stranded dicts from the dicts to skip set
145
- dict_to_skip.difference_update(C_stranded_dicts)
146
-
147
- if record not in dict_c.keys() and record not in dict_c_bottom.keys() and record not in dict_c_top.keys():
148
- dict_c[record], dict_c_bottom[record], dict_c_top[record] = {}, {}, {}
149
-
150
- # get a dictionary of dataframes that only contain methylated cytosine positions
151
- dict_c[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'C']
152
- print('{}: Successfully created a methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
153
- # Stratify the cytosine dictionary into two strand specific dictionaries.
154
- dict_c_bottom[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '-']
155
- print('{}: Successfully created a minus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
156
- dict_c_top[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '+']
157
- print('{}: Successfully created a plus strand methyl-cytosine dictionary for '.format(readwrite.time_string()) + str(i))
158
- # In the strand specific dictionaries, only keep positions that are informative for GpC SMF
159
-
160
- if '6mA' in mods and '5mC' in mods:
161
- # Remove combined stranded dicts from the dicts to skip set
162
- dict_to_skip.difference_update(combined_dicts)
163
- # Initialize the sample keys for the combined dictionaries
164
-
165
- if record not in dict_combined_bottom.keys() and record not in dict_combined_top.keys():
166
- dict_combined_bottom[record], dict_combined_top[record]= {}, {}
167
-
168
- print('{}: Successfully created a minus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(i))
169
- dict_combined_bottom[record][i] = []
170
- print('{}: Successfully created a plus strand combined methylation dictionary for '.format(readwrite.time_string()) + str(i))
171
- dict_combined_top[record][i] = []
172
-
173
- # Iterate over the stranded modification dictionaries and replace the dataframes with a dictionary of read names pointing to a list of values from the dataframe
174
- for i, dict_type in enumerate(dict_list):
175
- # Only iterate over stranded dictionaries
176
- if i not in dict_to_skip:
177
- print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), sample_types[i]))
178
- for record in dict_type.keys():
179
- # Get the dictionary for the modification type of interest from the reference mapping of interest
180
- dict = dict_type[record]
181
- print('{0}: Extracting methylation states for {1} dictionary'.format(readwrite.time_string(), record))
182
- # For each sample in a stranded dictionary
183
- for sample in dict.keys():
184
- print('{0}: Extracting {1} dictionary from record {2} for sample {3}'.format(readwrite.time_string(), sample_types[i], record, sample))
185
- # Load the combined bottom strand dictionary after all the individual dictionaries have been made for the sample
186
- if i == 7:
187
- # Load the minus strand dictionaries for each sample into temporary variables
188
- temp_a_dict = dict_list[2][record][sample].copy()
189
- temp_c_dict = dict_list[5][record][sample].copy()
190
- dict[sample] = {}
191
- # Iterate over the reads present in the merge of both dictionaries
192
- for read in set(temp_a_dict) | set(temp_c_dict):
193
- # Add the arrays element-wise if the read is present in both dictionaries
194
- if read in temp_a_dict and read in temp_c_dict:
195
- dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
196
- # If the read is present in only one dictionary, copy its value
197
- elif read in temp_a_dict:
198
- dict[sample][read] = temp_a_dict[read]
199
- else:
200
- dict[sample][read] = temp_c_dict[read]
201
- # Load the combined top strand dictionary after all the individual dictionaries have been made for the sample
202
- elif i == 8:
203
- # Load the plus strand dictionaries for each sample into temporary variables
204
- temp_a_dict = dict_list[3][record][sample].copy()
205
- temp_c_dict = dict_list[6][record][sample].copy()
206
- dict[sample] = {}
207
- # Iterate over the reads present in the merge of both dictionaries
208
- for read in set(temp_a_dict) | set(temp_c_dict):
209
- # Add the arrays element-wise if the read is present in both dictionaries
210
- if read in temp_a_dict and read in temp_c_dict:
211
- dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
212
- # If the read is present in only one dictionary, copy its value
213
- elif read in temp_a_dict:
214
- dict[sample][read] = temp_a_dict[read]
215
- else:
216
- dict[sample][read] = temp_c_dict[read]
217
- # For all other dictionaries
218
- else:
219
- # extract the dataframe from the dictionary into a temporary variable
220
- temp_df = dict[sample]
221
- # reassign the dictionary pointer to a nested dictionary.
222
- dict[sample] = {}
223
- # Iterate through rows in the temp DataFrame
224
- for index, row in temp_df.iterrows():
225
- read = row['read_id'] # read name
226
- position = row['ref_position'] # positional coordinate
227
- probability = row['call_prob'] # Get the probability of the given call
228
- # If the call_code is modified, set the methylated value to the probability of methylation
229
- if (row['call_code'] in ['a', 'h', 'm']):
230
- methylated = probability
231
- # If the call code is canonical, change the methylated value to 1 - the probability of canonical
232
- elif (row['call_code'] in ['-']):
233
- methylated = 1 - probability
234
-
235
- # If the current read is not in the dictionary yet, initialize its entry with a NaN-filled numpy array of the proper size.
236
- if read not in dict[sample]:
237
- dict[sample][read] = np.full(max_reference_length, np.nan)
238
- else:
239
- pass
240
- # add the positional methylation state to the numpy array
241
- dict[sample][read][position-1] = methylated
242
-
243
- # Save the sample files in the batch as gzipped hdf5 files
244
- print('{0}: Converting batch {1} dictionaries to anndata objects'.format(readwrite.time_string(), batch))
245
- for i, dict_type in enumerate(dict_list):
246
- if i not in dict_to_skip:
247
- # Initialize an hdf5 file for the current modified strand
248
- adata = None
249
- print('{0}: Converting {1} dictionary to an anndata object'.format(readwrite.time_string(), sample_types[i]))
250
- for record in dict_type.keys():
251
- # Get the dictionary for the modification type of interest from the reference mapping of interest
252
- dict = dict_type[record]
253
- for sample in dict.keys():
254
- print('{0}: Converting {1} dictionary for sample {2} to an anndata object'.format(readwrite.time_string(), sample_types[i], sample))
255
- sample = int(sample)
256
- final_sample_index = sample + (batch * batch_size)
257
- print('{0}: Final sample index for sample: {1}'.format(readwrite.time_string(), final_sample_index))
258
- print('{0}: Converting {1} dictionary for sample {2} to a dataframe'.format(readwrite.time_string(), sample_types[i], final_sample_index))
259
- temp_df = pd.DataFrame.from_dict(dict[sample], orient='index')
260
- sorted_index = sorted(temp_df.index)
261
- temp_df = temp_df.reindex(sorted_index)
262
- X = temp_df.values
263
- one_hot_encodings = record_seq_dict[record][0]
264
- read_names = list(one_hot_encodings.keys())
265
- sequence_length = one_hot_encodings[read_names[0]].shape[0]
266
- dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
267
- # Loop through each read name and its corresponding one-hot array
268
- print('{0}: Extracting one hot encodings into dictionaries'.format(readwrite.time_string()))
269
- for read_name, one_hot_array in one_hot_encodings.items():
270
- dict_A[read_name] = one_hot_array[:, 0]
271
- dict_C[read_name] = one_hot_array[:, 1]
272
- dict_G[read_name] = one_hot_array[:, 2]
273
- dict_T[read_name] = one_hot_array[:, 3]
274
- dict_N[read_name] = one_hot_array[:, 4]
275
- # Load dfs with data from the dictionaries
276
- print('{0}: Loading dataframes from one hot encoded dictionaries'.format(readwrite.time_string()))
277
- df_A = pd.DataFrame.from_dict(dict_A, orient='index').reindex(sorted_index)
278
- df_C = pd.DataFrame.from_dict(dict_C, orient='index').reindex(sorted_index)
279
- df_G = pd.DataFrame.from_dict(dict_G, orient='index').reindex(sorted_index)
280
- df_T = pd.DataFrame.from_dict(dict_T, orient='index').reindex(sorted_index)
281
- df_N = pd.DataFrame.from_dict(dict_N, orient='index').reindex(sorted_index)
282
-
283
- ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
284
-
285
- print('{0}: Loading {1} dataframe for sample {2} into a temp anndata object'.format(readwrite.time_string(), sample_types[i], final_sample_index))
286
- temp_adata = ad.AnnData(X, dtype=X.dtype)
287
- print('{0}: Adding read names and position ids to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
288
- temp_adata.obs_names = temp_df.index
289
- temp_adata.obs_names = temp_adata.obs_names.astype(str)
290
- temp_adata.var_names = temp_df.columns
291
- temp_adata.var_names = temp_adata.var_names.astype(str)
292
- print('{0}: Adding final sample id to {1} anndata for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
293
- temp_adata.obs['Sample'] = [str(final_sample_index)] * len(temp_adata)
294
- dataset, strand = sample_types[i].split('_')[:2]
295
- temp_adata.obs['Strand'] = [strand] * len(temp_adata)
296
- temp_adata.obs['Dataset'] = [dataset] * len(temp_adata)
297
- temp_adata.obs['Reference'] = [f'{record}_{dataset}_{strand}'] * len(temp_adata)
298
- temp_adata.obs['Reference_chromosome'] = [f'{record}'] * len(temp_adata)
299
-
300
- for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
301
- temp_adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
302
-
303
- # If final adata object already has a sample loaded, concatenate the current sample into the existing adata object
304
- if adata:
305
- print('{0}: Concatenating {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
306
- adata = ad.concat([adata, temp_adata], join='outer', index_unique=None)
307
- else:
308
- print('{0}: Initializing {1} anndata object for sample {2}'.format(readwrite.time_string(), sample_types[i], final_sample_index))
309
- adata = temp_adata
310
-
311
- print('{0}: Writing {1} anndata out as a gzipped hdf5 file'.format(readwrite.time_string(), sample_types[i]))
312
- adata.write_h5ad('{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(readwrite.date_string(), batch, sample_types[i]), compression='gzip')
313
-
314
- # Delete the batch dictionaries from memory
315
- del dict_list
316
- gc.collect()
317
-
318
- # Iterate over all of the batched hdf5 files and concatenate them.
319
- files = os.listdir(os.getcwd())
320
- # Name the final output file
321
- final_hdf = '{0}_{1}_final_experiment_hdf5.h5ad.gz'.format(readwrite.date_string(), experiment_name)
322
- # Filter file names that contain the search string in their filename and keep them in a list
323
- hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
324
- # Sort file list by names and print the list of file names
325
- hdfs.sort()
326
- print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
327
- final_adata = None
328
- for hdf in hdfs:
329
- print('{0}: Reading in {1} hdf5 file'.format(readwrite.time_string(), hdf))
330
- temp_adata = ad.read_h5ad(hdf)
331
- if final_adata:
332
- print('{0}: Concatenating final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdf))
333
- final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
334
- else:
335
- print('{0}: Initializing final adata object with {1} hdf5 file'.format(readwrite.time_string(), hdf))
336
- final_adata = temp_adata
337
- print('{0}: Writing final concatenated hdf5 file'.format(readwrite.time_string()))
338
-
339
- for record in records_to_analyze:
340
- # Add FASTA sequence to the object
341
- sequence = record_seq_dict[record][1]
342
- final_adata.uns[f'{record}_FASTA_sequence'] = sequence
343
- final_adata.var[f'{record}_FASTA_sequence_base'] = list(sequence)
344
-
345
- # Add consensus sequence of samples mapped to the record to the object
346
- record_subset = final_adata[final_adata.obs['Reference_chromosome'] == record].copy()
347
- layer_map, layer_counts = {}, []
348
- for i, layer in enumerate(record_subset.layers):
349
- layer_map[i] = layer.split('_')[0]
350
- layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
351
- count_array = np.array(layer_counts)
352
- nucleotide_indexes = np.argmax(count_array, axis=0)
353
- consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
354
- final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
355
-
356
- final_adata.write_h5ad(final_hdf, compression='gzip')
357
-
358
- # Delete the individual h5ad files and only keep the final concatenated file
359
- files = os.listdir(os.getcwd())
360
- hdfs_to_delete = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
361
- # Iterate over the files and delete them
362
- for hdf in hdfs_to_delete:
363
- try:
364
- os.remove(hdf)
365
- print(f"Deleted file: {hdf}")
366
- except OSError as e:
367
- print(f"Error deleting file {hdf}: {e}")
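For orientation, below is a minimal sketch of how this helper might be invoked once `modkit extract` has written one `*extract.tsv` per split BAM; all paths and values are illustrative placeholders, not defaults from the package. Per the loop above, each read position is stored as the call probability for modified call codes ('a', 'h', 'm') and as 1 minus the call probability for canonical calls ('-').

    # Hypothetical invocation; assumes the current working directory already holds
    # the per-sample '*extract.tsv' files produced by 'modkit extract'.
    from smftools.informatics.helpers import modkit_extract_to_adata

    modkit_extract_to_adata(
        fasta='reference.fasta',           # reference the BAM was aligned to
        bam='calls_aligned_sorted.bam',    # aligned, sorted, non-split modified BAM
        mapping_threshold=0.05,            # keep references holding >= 5% of mapped reads
        experiment_name='example_run',
        mods=['6mA', '5mC'],               # modification types to extract
        batch_size=4,                      # TSVs held in memory per batch
    )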
@@ -1,19 +0,0 @@
1
- # one_hot_encode
2
-
3
- # String encodings
4
- def one_hot_encode(sequence):
5
- """
6
- One hot encodes a sequence string.
7
- Parameters:
8
- sequence (str): A DNA sequence string.
9
-
10
- Returns:
11
- one_hot_matrix (ndarray): A numpy ndarray holding a vstacked one hot encoding of the input sequence string.
12
- """
13
- import numpy as np
14
-
15
- mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
16
- one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
17
- for i, nucleotide in enumerate(sequence):
18
- one_hot_matrix[i, mapping[nucleotide]] = 1
19
- return one_hot_matrix
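A small usage sketch of the encoder above; the output shown is what the mapping in the function produces.

    # 'A' -> column 0, 'C' -> column 1, 'N' -> column 4; one 1 per row.
    one_hot_encode('ACN')
    # array([[1, 0, 0, 0, 0],
    #        [0, 1, 0, 0, 0],
    #        [0, 0, 0, 0, 1]])

Note that bases outside 'ACGTN' (including lowercase characters) would raise a KeyError with this mapping.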
@@ -1,41 +0,0 @@
1
- ## separate_bam_by_bc
2
-
3
- # General
4
- def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
5
- """
6
- Separates an input BAM file on the BC SAM tag values.
7
-
8
- Parameters:
9
- input_bam (str): File path to the BAM file to split.
10
- output_prefix (str): A prefix to prepend to the output BAM file names.
11
- bam_suffix (str): A suffix to add to the bam file.
12
-
13
- Returns:
14
- None
15
- Writes out split BAM files.
16
- """
17
- import pysam
18
- import os
19
-
20
- bam_base = os.path.basename(input_bam)
21
- bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
22
-
23
- # Open the input BAM file for reading
24
- with pysam.AlignmentFile(input_bam, "rb") as bam:
25
- # Create a dictionary to store output BAM files
26
- output_files = {}
27
- # Iterate over each read in the BAM file
28
- for read in bam:
29
- try:
30
- # Get the barcode tag value
31
- bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
32
- # Open the output BAM file corresponding to the barcode
33
- if bc_tag not in output_files:
34
- output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}", "wb", header=bam.header)
35
- # Write the read to the corresponding output BAM file
36
- output_files[bc_tag].write(read)
37
- except KeyError:
38
- print(f"BC tag not present for read: {read.query_name}")
39
- # Close all output BAM files
40
- for output_file in output_files.values():
41
- output_file.close()
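A hypothetical call, assuming reads carry BC tags of the form 'barcodeNN' as the tag parsing above expects; the file names are placeholders.

    # Reads tagged BC:Z:barcode01, BC:Z:barcode02, ... are written to
    # 20240101_aligned_sorted_01.bam, 20240101_aligned_sorted_02.bam, and so on.
    separate_bam_by_bc(
        input_bam='aligned_sorted.bam',
        output_prefix='20240101',   # the pipeline passes a date string here
        bam_suffix='.bam',
    )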
@@ -1,29 +0,0 @@
1
- ## split_and_index_BAM
2
-
3
- def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
4
- """
5
- A wrapper function for splitting BAMs and indexing them.
6
- Parameters:
7
- aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
8
- split_dir (str): A string representing the file path to the directory to split the BAMs into.
9
- bam_suffix (str): A suffix to add to the bam file.
10
-
11
- Returns:
12
- None
13
- Splits an input BAM file on barcode value and makes a BAM index file.
14
- """
15
- from .. import readwrite
16
- import os
17
- import subprocess
18
- import glob
19
- from .separate_bam_by_bc import separate_bam_by_bc
20
-
21
- os.chdir(split_dir)
22
- aligned_sorted_output = aligned_sorted_BAM + bam_suffix
23
- file_prefix = readwrite.date_string()
24
- separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix)
25
- # Make a BAM index file for the BAMs in that directory
26
- bam_pattern = '*' + bam_suffix
27
- bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
28
- for input_file in bam_files:
29
- subprocess.run(["samtools", "index", input_file])
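A hypothetical call; note that the first argument is the aligned, sorted BAM path without its suffix, since the function appends bam_suffix itself. Paths are placeholders.

    # Splits '/out/calls_aligned_sorted.bam' by the BC tag into /out/split_BAMs,
    # then runs 'samtools index' on every '*.bam' found in that directory.
    split_and_index_BAM(
        aligned_sorted_BAM='/out/calls_aligned_sorted',
        split_dir='/out/split_BAMs',
        bam_suffix='.bam',
    )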
@@ -1,53 +0,0 @@
1
- ## pod5_conversion
2
-
3
- def pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix):
4
- """
5
- Converts POD5 files from a nanopore conversion SMF experiment into an adata object.
6
-
7
- Parameters:
8
- fasta (str): File path to the reference genome to align to.
9
- output_directory (str): A file path to the directory to output all the analyses.
10
- conversion_types (list): A list of strings of the conversion types to use in the analysis.
11
- strands (list): A list of conversion strands to use in the experiment.
12
- model (str): a string representing the file path to the dorado basecalling model.
13
- pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
14
- split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
- barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
- mapping_threshold (float): A value between 0 and 1 giving the minimum fraction of aligned reads that must map to a reference region. References at or above the threshold are included in the output adata.
17
- experiment_name (str): A string to provide an experiment name to the output adata file.
18
- bam_suffix (str): A suffix to add to the bam file.
19
-
20
- Returns:
21
- None
22
- """
23
- from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
24
- import os
25
- model_basename = os.path.basename(model)
26
- model_basename = model_basename.replace('.', '_')
27
- bam=f"{output_directory}/{model_basename}_canonical_basecalls"
28
- aligned_BAM=f"{bam}_aligned"
29
- aligned_sorted_BAM=f"{aligned_BAM}_sorted"
30
-
31
- os.chdir(output_directory)
32
-
33
- # 1) Convert FASTA file
34
- fasta_basename = os.path.basename(fasta)
35
- converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
36
- converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
37
- if os.path.exists(converted_FASTA):
38
- print(converted_FASTA + ' already exists. Using existing converted FASTA.')
39
- else:
40
- generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
41
-
42
- # 2) Basecall from the input POD5 to generate a singular output BAM
43
- canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix)
44
-
45
- # 3) Align the BAM to the converted reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
46
- input_BAM = bam + bam_suffix
47
- align_and_sort_BAM(converted_FASTA, input_BAM, bam_suffix, output_directory)
48
-
49
- ### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
50
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
51
-
52
- # 5) Take the converted BAM and load it into an adata object.
53
- converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
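A hypothetical end-to-end call for a conversion-based experiment; every path, kit name, and conversion label below is a placeholder chosen to show the expected argument shapes, not a value defined by the package.

    pod5_conversion(
        fasta='/refs/amplicon.fasta',
        output_directory='/out',
        conversion_types=['unconverted', '5mC'],   # placeholder conversion labels
        strands=['bottom', 'top'],
        model='/models/dorado_basecalling_model',  # placeholder Dorado model path
        pod5_dir='/data/pod5',
        split_dir='/out/split_BAMs',
        barcode_kit='SQK-NBD114-24',               # placeholder barcoding kit
        mapping_threshold=0.05,
        experiment_name='conversion_run',
        bam_suffix='.bam',
    )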
@@ -1,55 +0,0 @@
1
- ## pod5_direct
2
-
3
- def pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size):
4
- """
5
- Converts POD5 files from a nanopore native SMF experiment into an adata object.
6
-
7
- Parameters:
8
- fasta (str): File path to the reference genome to align to.
9
- output_directory (str): A file path to the directory to output all the analyses.
10
- mod_list (list): A list of strings of the modification types to use in the analysis.
11
- model (str): a string representing the file path to the dorado basecalling model.
12
- thresholds (list): A list of floats to pass for call thresholds.
13
- pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
14
- split_dir (str): A string representing the file path to the directory to split the BAMs into.
15
- barcode_kit (str): A string representing the barcoding kit used in the experiment.
16
- mapping_threshold (float): A value between 0 and 1 giving the minimum fraction of aligned reads that must map to a reference region. References at or above the threshold are included in the output adata.
17
- experiment_name (str): A string to provide an experiment name to the output adata file.
18
- bam_suffix (str): A suffix to add to the bam file.
19
- batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
20
-
21
- Returns:
22
- None
23
- """
24
- from .helpers import align_and_sort_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
25
- import os
26
- model_basename = os.path.basename(model)
27
- model_basename = model_basename.replace('.', '_')
28
- mod_string = "_".join(mod_list)
29
- bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
30
- aligned_BAM=f"{bam}_aligned"
31
- aligned_sorted_BAM=f"{aligned_BAM}_sorted"
32
- mod_bed_dir=f"{output_directory}/split_mod_beds"
33
- mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
34
-
35
- make_dirs([mod_bed_dir, mod_tsv_dir])
36
-
37
- aligned_sorted_output = aligned_sorted_BAM + bam_suffix
38
- mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
39
- mods = [mod_map[mod] for mod in mod_list]
40
-
41
- os.chdir(output_directory)
42
-
43
- # 1) Basecall using dorado
44
- modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix)
45
- # 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
46
- input_BAM = bam + bam_suffix
47
- align_and_sort_BAM(fasta, input_BAM, bam_suffix, output_directory)
48
- # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
49
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
50
- # 4) Using nanopore modkit to work with modified BAM files ###
51
- modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
52
- make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample
53
- extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix) # Extract methylations calls for split BAM files into split TSV files
54
- #5 Load the modification data from TSVs into an adata object
55
- modkit_extract_to_adata(fasta, aligned_sorted_output, mapping_threshold, experiment_name, mods, batch_size)
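A hypothetical call for a direct (native) modification experiment; paths, the model path, and threshold values are placeholders. The mod_list entries must be keys of the mod_map above, and thresholds follows the [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold] order assembled by pod5_to_adata.

    pod5_direct(
        fasta='/refs/genome.fasta',
        output_directory='/out',
        mod_list=['6mA', '5mC_5hmC'],             # must match the mod_map keys above
        model='/models/dorado_basecalling_model', # placeholder Dorado model path
        thresholds=[0.8, 0.8, 0.8, 0.8],          # placeholder filter/m6A/m5C/hm5C thresholds
        pod5_dir='/data/pod5',
        split_dir='/out/split_BAMs',
        barcode_kit='SQK-NBD114-24',              # placeholder barcoding kit
        mapping_threshold=0.05,
        experiment_name='direct_run',
        bam_suffix='.bam',
        batch_size=4,
    )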
@@ -1,40 +0,0 @@
1
- ## pod5_to_adata
2
-
3
- def pod5_to_adata(config_path):
4
- """
5
- High-level entry point for converting raw sequencing data into an adata object.
6
-
7
- Parameters:
8
- config_path (str): A string representing the file path to the experiment configuration csv file.
9
-
10
- Returns:
11
- None
12
- """
13
- from .helpers import LoadExperimentConfig, make_dirs
14
- import os
15
- bam_suffix = '.bam' # If different, change from here.
16
- split_dir = 'split_BAMs' # If different, change from here.
17
- strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
18
- conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
19
-
20
- # Load experiment config parameters into global variables
21
- experiment_config = LoadExperimentConfig(config_path)
22
- var_dict = experiment_config.var_dict
23
- for key, value in var_dict.items():
24
- globals()[key] = value
25
-
26
- conversions += conversion_types
27
-
28
- split_path = os.path.join(output_directory, split_dir)
29
- make_dirs([output_directory, split_path])
30
- os.chdir(output_directory)
31
-
32
- if smf_modality == 'conversion':
33
- from .pod5_conversion import pod5_conversion
34
- pod5_conversion(fasta, output_directory, conversions, strands, model, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
35
- elif smf_modality == 'direct':
36
- from .pod5_direct import pod5_direct
37
- thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
38
- pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
39
- else:
40
- print("Error: unsupported smf_modality; expected 'conversion' or 'direct'.")
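A hypothetical top-level usage: the function takes only the configuration CSV path, and everything else (fasta, model, pod5_dir, smf_modality, thresholds, and so on) is expected to come from that file. The import path assumes the package re-exports the function from smftools.informatics.

    from smftools.informatics import pod5_to_adata  # assumed re-export; adjust if needed

    # The CSV supplies the variables that pod5_to_adata loads into globals above.
    pod5_to_adata('experiment_config.csv')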