smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools-0.1.6.dist-info/METADATA +127 -0
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -19
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/__init__.py +0 -9
  8. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  9. smftools/datasets/datasets.py +0 -27
  10. smftools/informatics/__init__.py +0 -12
  11. smftools/informatics/bam_conversion.py +0 -47
  12. smftools/informatics/bam_direct.py +0 -49
  13. smftools/informatics/basecalls_to_adata.py +0 -42
  14. smftools/informatics/fast5_to_pod5.py +0 -19
  15. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  16. smftools/informatics/helpers/__init__.py +0 -42
  17. smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
  18. smftools/informatics/helpers/archived/informatics.py +0 -260
  19. smftools/informatics/helpers/archived/load_adata.py +0 -516
  20. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  21. smftools/informatics/helpers/canoncall.py +0 -23
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
  23. smftools/informatics/helpers/count_aligned_reads.py +0 -39
  24. smftools/informatics/helpers/extract_base_identities.py +0 -43
  25. smftools/informatics/helpers/extract_mods.py +0 -51
  26. smftools/informatics/helpers/find_conversion_sites.py +0 -59
  27. smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
  28. smftools/informatics/helpers/get_native_references.py +0 -28
  29. smftools/informatics/helpers/make_dirs.py +0 -21
  30. smftools/informatics/helpers/make_modbed.py +0 -27
  31. smftools/informatics/helpers/modQC.py +0 -27
  32. smftools/informatics/helpers/modcall.py +0 -26
  33. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
  34. smftools/informatics/helpers/one_hot_encode.py +0 -19
  35. smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
  36. smftools/informatics/helpers/split_and_index_BAM.py +0 -29
  37. smftools/informatics/pod5_conversion.py +0 -53
  38. smftools/informatics/pod5_direct.py +0 -55
  39. smftools/informatics/pod5_to_adata.py +0 -40
  40. smftools/informatics/readwrite.py +0 -106
  41. smftools/informatics/subsample_pod5.py +0 -48
  42. smftools/plotting/__init__.py +0 -0
  43. smftools/preprocessing/__init__.py +0 -29
  44. smftools/preprocessing/append_C_context.py +0 -46
  45. smftools/preprocessing/archives/preprocessing.py +0 -614
  46. smftools/preprocessing/binarize_on_Youden.py +0 -42
  47. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  48. smftools/preprocessing/calculate_complexity.py +0 -71
  49. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
  50. smftools/preprocessing/calculate_coverage.py +0 -41
  51. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  52. smftools/preprocessing/calculate_position_Youden.py +0 -104
  53. smftools/preprocessing/calculate_read_length_stats.py +0 -32
  54. smftools/preprocessing/clean_NaN.py +0 -38
  55. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
  56. smftools/preprocessing/filter_reads_on_length.py +0 -39
  57. smftools/preprocessing/invert_adata.py +0 -22
  58. smftools/preprocessing/mark_duplicates.py +0 -119
  59. smftools/preprocessing/min_non_diagonal.py +0 -25
  60. smftools/preprocessing/remove_duplicates.py +0 -18
  61. smftools/readwrite.py +0 -106
  62. smftools/tools/__init__.py +0 -0
  63. smftools-0.1.1.dist-info/METADATA +0 -88
  64. smftools-0.1.1.dist-info/RECORD +0 -64
  65. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  66. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/archived/load_adata.py
@@ -1,516 +0,0 @@
1
- # load_adata
2
- ######################################################################################################
3
- from . import utils
4
- # File I/O
5
- import subprocess
6
- import gc
7
-
8
- # bioinformatic operations
9
- from . import informatics_module
10
-
11
- # User interface
12
- from tqdm import tqdm
13
-
14
- ######################################################################################################
15
- # Conversion SMF
16
- def converted_BAM_to_adata(converted_fasta_file, bam_directory, mapping_threshold, experiment_name, modification_types=['5mC', '6mA'], strands=['top', 'bottom']):
17
- """
18
- Inputs:
19
- "converted_fasta_file", help="converted FASTA file"
20
- "bam_directory", help="Directory containing input BAMs to binarize"
21
- "mapping_threshold", help="Minimal threshold of mapped reads to a reference chromosome to allow"
22
- "experiment_name", help="String to append to the output h5ad file"
23
- "modification_types", help=" a list of modifications to detect. Options are 5mC and 6mA"
24
- "strands", help="A list of strands to include in the analysis. Options are top and bottom"
25
- Outputs:
26
- Takes a directory of BAM files from conversion SMF and generates a gzipped h5ad file.
27
- """
28
- mapping_threshold = float(mapping_threshold)
29
- bam_suffix = '.bam'
30
- # Get all of the input BAM files
31
- files = os.listdir(bam_directory)
32
- # Change directory to the BAM directory
33
- os.chdir(bam_directory)
34
- # Filter file names that contain the search string in their filename and keep them in a list
35
- bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
36
- # Sort file list by names and print the list of file names
37
- bams.sort()
38
- print(f'Found the following BAMS: {bams}')
39
- # Options include 6mA, 5mC
40
- # Options include top and bottom
41
- final_adata = None
42
-
43
- # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) record sequence
44
- modification_dict = {}
45
- # While populating the dictionary, also extract the longest sequence record in the input references
46
- max_reference_length = 0
47
- for modification_type in modification_types:
48
- modification_dict[modification_type] = find_coordinates(converted_fasta_file, modification_type)
49
- for record in modification_dict[modification_type].keys():
50
- if modification_dict[modification_type][record][0] > max_reference_length:
51
- max_reference_length = modification_dict[modification_type][record][0]
52
- # Iterate over the experiment BAM files
53
- for bam_index, bam in enumerate(bams):
54
- # Give each bam a sample name
55
- sample = bam.split(sep=bam_suffix)[0]
56
- # look at aligned read proportions in the bam
57
- aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
58
- percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
59
- print(f'{percent_aligned} percent of total reads in {bam} aligned successfully')
60
-
61
- records_to_analyze = []
62
- # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
63
- for record in record_counts:
64
- print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
65
- if record_counts[record][1] >= mapping_threshold:
66
- records_to_analyze.append(record)
67
- print(f'Records to analyze: {records_to_analyze}')
68
-
69
- # Iterate over records to analyze (ie all conversions detected)
70
- record_FASTA_dict = {}
71
- for record in records_to_analyze:
72
- mod_type, strand = record.split('_')[-2:]
73
- if strand == 'top':
74
- strand_index = 1
75
- elif strand == 'bottom':
76
- strand_index = 2
77
-
78
- chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
79
- unconverted_chromosome_name = chromosome + '_unconverted_top'
80
- positions = modification_dict[mod_type][unconverted_chromosome_name][strand_index]
81
- current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
82
- delta_max_length = max_reference_length - current_reference_length
83
- sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
84
- record_FASTA_dict[f'{record}'] = sequence
85
- print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')
86
-
87
- # Get a dictionary of positional identities keyed by read id
88
- print(f'Extracting base identities of target positions')
89
- target_base_identities = extract_base_identity_at_coordinates(bam, record, positions, max_reference_length)
90
- # binarize the dictionary of positional identities
91
- print(f'Binarizing base identities of target positions')
92
- binarized_base_identities = binarize_base_identities(target_base_identities, strand, mod_type)
93
- # converts the base identity dictionary to a dataframe.
94
- binarized_base_identities_df = pd.DataFrame.from_dict(binarized_base_identities, orient='index')
95
- sorted_index = sorted(binarized_base_identities_df.index)
96
- binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
97
- # Get the sequence string of every read
98
- print(f'Extracting base identities of all positions in each read')
99
- all_base_identities = extract_base_identity_at_coordinates(bam, record, range(current_reference_length), max_reference_length)
100
- # One hot encode the sequence string of the reads
101
- print(f'One hot encoding base identities of all positions in each read')
102
- one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}
103
-
104
- # Initialize empty DataFrames for each base
105
- read_names = list(one_hot_reads.keys())
106
- sequence_length = one_hot_reads[read_names[0]].shape[0]
107
- df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
108
- df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
109
- df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
110
- df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
111
- df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
112
-
113
- # Iterate through the dictionary and populate the DataFrames
114
- for read_name, one_hot_array in one_hot_reads.items():
115
- df_A.loc[read_name] = one_hot_array[:, 0]
116
- df_C.loc[read_name] = one_hot_array[:, 1]
117
- df_G.loc[read_name] = one_hot_array[:, 2]
118
- df_T.loc[read_name] = one_hot_array[:, 3]
119
- df_N.loc[read_name] = one_hot_array[:, 4]
120
-
121
- ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
122
-
123
- # Load an anndata object with the sample data
124
- X = binarized_base_identities_df.values
125
- adata = ad.AnnData(X, dtype=X.dtype)
126
- adata.obs_names = binarized_base_identities_df.index
127
- adata.obs_names = adata.obs_names.astype(str)
128
- adata.var_names = binarized_base_identities_df.columns
129
- adata.var_names = adata.var_names.astype(str)
130
- adata.obs['Sample'] = [sample] * len(adata)
131
- adata.obs['Strand'] = [strand] * len(adata)
132
- adata.obs['Dataset'] = [mod_type] * len(adata)
133
- adata.obs['Reference'] = [record] * len(adata)
134
- adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
135
-
136
- for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
137
- adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
138
-
139
- if final_adata is not None:
140
- final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
141
- else:
142
- final_adata = adata
143
-
144
- for record in record_FASTA_dict.keys():
145
- chromosome = record.split('_')[0]
146
- sequence = record_FASTA_dict[record]
147
- final_adata.uns[f'{record}_FASTA_sequence'] = sequence
148
- final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)
149
- record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
150
- layer_map, layer_counts = {}, []
151
- for i, layer in enumerate(record_subset.layers):
152
- layer_map[i] = layer.split('_')[0]
153
- layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
154
- count_array = np.array(layer_counts)
155
- nucleotide_indexes = np.argmax(count_array, axis=0)
156
- consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
157
- final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
158
- final_adata.uns[f'{record}_consensus_sequence'] = ''.join(consensus_sequence_list)
159
-
160
- ## Export the final adata object
161
- final_adata.write_h5ad('{0}_{1}.h5ad.gz'.format(date_string, experiment_name), compression='gzip')
162
-
163
- # Direct detection SMF
164
- def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
165
- """
166
- Inputs:
167
- "mods", help="list of modifications to analayze. Available mods [6mA, 5mC]"
168
- "fasta", help="a FASTA file to extract positions of interest from."
169
- "bam", help="a bam file to extract read-level sequence identities."
170
- "mapping_threshold", help="Minimal threshold of mapped reads to a reference chromosome to allow"
171
- "batch_size", help="Number of sample TSV files to process per batch"
172
- "experiment_name", help="An experiment name to add to the final anndata object"
173
- Output:
174
- Takes modkit extract sample TSV files and the experiment-level BAM file and generates an anndata object.
175
- """
176
- mapping_threshold = float(mapping_threshold)
177
-
178
- ###################################################
179
- ### Get input tsv file names into a sorted list ###
180
- # List all files in the directory
181
- files = os.listdir(os.getcwd())
182
- # get current working directory
183
- cwd = os.getcwd()
184
- # Filter file names that contain the search string in their filename and keep them in a list
185
- tsvs = [tsv for tsv in files if 'extract.tsv' in tsv]
186
- # Sort file list by names and print the list of file names
187
- tsvs.sort()
188
- print(f'{len(tsvs)} sample tsv files found: {tsvs}')
189
- print(f'sample bam file found: {bam}')
190
-
191
- # Get all references within the FASTA and indicate the length and identity of the record sequence
192
- max_reference_length = 0
193
- reference_dict = get_references(fasta)
194
- for record in reference_dict.keys():
195
- if reference_dict[record][0] > max_reference_length:
196
- max_reference_length = reference_dict[record][0]
197
-
198
- print(f'{time_string()}: Max reference length in dataset: {max_reference_length}')
199
- batch_size = int(batch_size) # Number of TSVs to maximally process in a batch
200
- batches = math.ceil(len(tsvs) / batch_size) # Number of batches to process
201
- print('{0}: Processing input tsvs in {1} batches of {2} tsvs '.format(time_string(), batches, batch_size))
202
-
203
- # look at aligned read proportions in the bam
204
- aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
205
- print('{} percent of reads in bam aligned successfully'.format(aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)))
206
- records_to_analyze = []
207
- # Iterate over references and decide which to use in the analysis based on the mapping_threshold
208
- for record in record_counts:
209
- print('{0} reads mapped to reference record {1}. This is {2} percent of all mapped reads'.format(record_counts[record][0], record, record_counts[record][1]*100))
210
- if record_counts[record][1] >= mapping_threshold:
211
- records_to_analyze.append(record)
212
- print(f'Records to analyze: {records_to_analyze}')
213
- # Iterate over records to analyze and return a dictionary keyed by the reference name that points to another dictionary keyed by read names that map to that reference. This internal dictionary points to a one-hot encoding of the mapped read
214
- record_seq_dict = {}
215
- for record in records_to_analyze:
216
- current_reference_length = reference_dict[record][0]
217
- delta_max_length = max_reference_length - current_reference_length
218
- sequence = reference_dict[record][1] + 'N'*delta_max_length
219
- # Get a dictionary of positional base identities keyed by read id
220
- base_identities = extract_base_identity_at_coordinates(bam, record, current_reference_length, max_reference_length)
221
- # One hot encode the sequence string of the reads
222
- one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
223
- record_seq_dict[record] = (one_hot_reads, sequence)
224
-
225
- ###################################################
226
-
227
- ###################################################
228
- # Begin iterating over batches
229
- for batch in range(batches):
230
- print('{0}: Processing tsvs for batch {1} '.format(time_string(), batch))
231
- # For the final batch, just take the remaining tsv files
232
- if batch == batches - 1:
233
- tsv_batch = tsvs
234
- # For all other batches, take the next batch of tsvs out of the file queue.
235
- else:
236
- tsv_batch = tsvs[:batch_size]
237
- tsvs = tsvs[batch_size:]
238
- print('{0}: tsvs in batch {1} '.format(time_string(), tsv_batch))
239
- ###################################################
240
-
241
- ###################################################
242
- ### Add the tsvs as dataframes to a dictionary (dict_total) keyed by integer index. Also make modification specific dictionaries and strand specific dictionaries.
243
- # Initialize dictionaries and place them in a list
244
- dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top = {},{},{},{},{},{},{},{},{}
245
- dict_list = [dict_total, dict_a, dict_a_bottom, dict_a_top, dict_c, dict_c_bottom, dict_c_top, dict_combined_bottom, dict_combined_top]
246
-
247
- # Give names to represent each dictionary in the list
248
- sample_types = ['total', 'm6A', 'm6A_bottom_strand', 'm6A_top_strand', '5mC', '5mC_bottom_strand', '5mC_top_strand', 'combined_bottom_strand', 'combined_top_strand']
249
-
250
- # Give indices of dictionaries to skip for analysis and final dictionary saving.
251
- dict_to_skip = [0, 1, 4]
252
- combined_dicts = [7, 8]
253
- A_stranded_dicts = [2, 3]
254
- C_stranded_dicts = [5, 6]
255
- dict_to_skip = dict_to_skip + combined_dicts + A_stranded_dicts + C_stranded_dicts
256
- dict_to_skip = set(dict_to_skip)
257
-
258
- # Load the dict_total dictionary with all of the tsv files as dataframes.
259
- for i, tsv in enumerate(tsv_batch):
260
- print('{0}: Loading sample tsv {1} into dataframe'.format(time_string(), tsv))
261
- temp_df = pd.read_csv(tsv, sep='\t', header=0)
262
- for record in records_to_analyze:
263
- if record not in dict_total.keys():
264
- dict_total[record] = {}
265
- # Only keep the reads aligned to the chromosomes of interest
266
- print('{0}: Filtering sample dataframe to keep chromosome of interest'.format(time_string()))
267
- dict_total[record][i] = temp_df[temp_df['chrom'] == record]
268
- # Only keep the read positions that fall within the region of interest
269
- print('{0}: Filtering sample dataframe to keep positions falling within region of interest'.format(time_string()))
270
- current_reference_length = reference_dict[record][0]
271
- dict_total[record][i] = dict_total[record][i][(current_reference_length > dict_total[record][i]['ref_position']) & (dict_total[record][i]['ref_position']>= 0)]
272
-
273
- # Iterate over dict_total of all the tsv files and extract the modification specific and strand specific dataframes into dictionaries
274
- for record in dict_total.keys():
275
- for i in dict_total[record].keys():
276
- if '6mA' in mods:
277
- # Remove Adenine stranded dicts from the dicts to skip set
278
- dict_to_skip.difference_update(A_stranded_dicts)
279
-
280
- if record not in dict_a.keys() and record not in dict_a_bottom.keys() and record not in dict_a_top.keys():
281
- dict_a[record], dict_a_bottom[record], dict_a_top[record] = {}, {}, {}
282
-
283
- # get a dictionary of dataframes that only contain methylated adenine positions
284
- dict_a[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'A']
285
- print('{}: Successfully created a methyl-adenine dictionary for '.format(time_string()) + str(i))
286
- # Stratify the adenine dictionary into two strand specific dictionaries.
287
- dict_a_bottom[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '-']
288
- print('{}: Successfully created a minus strand methyl-adenine dictionary for '.format(time_string()) + str(i))
289
- dict_a_top[record][i] = dict_a[record][i][dict_a[record][i]['ref_strand'] == '+']
290
- print('{}: Successfully created a plus strand methyl-adenine dictionary for '.format(time_string()) + str(i))
291
-
292
- if '5mC' in mods:
293
- # Remove Cytosine stranded dicts from the dicts to skip set
294
- dict_to_skip.difference_update(C_stranded_dicts)
295
-
296
- if record not in dict_c.keys() and record not in dict_c_bottom.keys() and record not in dict_c_top.keys():
297
- dict_c[record], dict_c_bottom[record], dict_c_top[record] = {}, {}, {}
298
-
299
- # get a dictionary of dataframes that only contain methylated cytosine positions
300
- dict_c[record][i] = dict_total[record][i][dict_total[record][i]['modified_primary_base'] == 'C']
301
- print('{}: Successfully created a methyl-cytosine dictionary for '.format(time_string()) + str(i))
302
- # Stratify the cytosine dictionary into two strand specific dictionaries.
303
- dict_c_bottom[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '-']
304
- print('{}: Successfully created a minus strand methyl-cytosine dictionary for '.format(time_string()) + str(i))
305
- dict_c_top[record][i] = dict_c[record][i][dict_c[record][i]['ref_strand'] == '+']
306
- print('{}: Successfully created a plus strand methyl-cytosine dictionary for '.format(time_string()) + str(i))
307
- # In the strand specific dictionaries, only keep positions that are informative for GpC SMF
308
-
309
- if '6mA' in mods and '5mC' in mods:
310
- # Remove combined stranded dicts from the dicts to skip set
311
- dict_to_skip.difference_update(combined_dicts)
312
- # Initialize the sample keys for the combined dictionaries
313
-
314
- if record not in dict_combined_bottom.keys() and record not in dict_combined_top.keys():
315
- dict_combined_bottom[record], dict_combined_top[record]= {}, {}
316
-
317
- print('{}: Successfully created a minus strand combined methylation dictionary for '.format(time_string()) + str(i))
318
- dict_combined_bottom[record][i] = []
319
- print('{}: Successfully created a plus strand combined methylation dictionary for '.format(time_string()) + str(i))
320
- dict_combined_top[record][i] = []
321
-
322
- # Iterate over the stranded modification dictionaries and replace the dataframes with a dictionary of read names pointing to a list of values from the dataframe
323
- for i, dict_type in enumerate(dict_list):
324
- # Only iterate over stranded dictionaries
325
- if i not in dict_to_skip:
326
- print('{0}: Extracting methylation states for {1} dictionary'.format(time_string(), sample_types[i]))
327
- for record in dict_type.keys():
328
- # Get the dictionary for the modification type of interest from the reference mapping of interest
329
- dict = dict_type[record]
330
- print('{0}: Extracting methylation states for {1} dictionary'.format(time_string(), record))
331
- # For each sample in a stranded dictionary
332
- for sample in dict.keys():
333
- print('{0}: Extracting {1} dictionary from record {2} for sample {3}'.format(time_string(), sample_types[i], record, sample))
334
- # Load the combined bottom strand dictionary after all the individual dictionaries have been made for the sample
335
- if i == 7:
336
- # Load the minus strand dictionaries for each sample into temporary variables
337
- temp_a_dict = dict_list[2][record][sample].copy()
338
- temp_c_dict = dict_list[5][record][sample].copy()
339
- dict[sample] = {}
340
- # Iterate over the reads present in the merge of both dictionaries
341
- for read in set(temp_a_dict) | set(temp_c_dict):
342
- # Add the arrays element-wise if the read is present in both dictionaries
343
- if read in temp_a_dict and read in temp_c_dict:
344
- dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
345
- # If the read is present in only one dictionary, copy its value
346
- elif read in temp_a_dict:
347
- dict[sample][read] = temp_a_dict[read]
348
- else:
349
- dict[sample][read] = temp_c_dict[read]
350
- # Load the combined top strand dictionary after all the individual dictionaries have been made for the sample
351
- elif i == 8:
352
- # Load the plus strand dictionaries for each sample into temporary variables
353
- temp_a_dict = dict_list[3][record][sample].copy()
354
- temp_c_dict = dict_list[6][record][sample].copy()
355
- dict[sample] = {}
356
- # Iterate over the reads present in the merge of both dictionaries
357
- for read in set(temp_a_dict) | set(temp_c_dict):
358
- # Add the arrays element-wise if the read is present in both dictionaries
359
- if read in temp_a_dict and read in temp_c_dict:
360
- dict[sample][read] = np.nansum([temp_a_dict[read], temp_c_dict[read]], axis=0)
361
- # If the read is present in only one dictionary, copy its value
362
- elif read in temp_a_dict:
363
- dict[sample][read] = temp_a_dict[read]
364
- else:
365
- dict[sample][read] = temp_c_dict[read]
366
- # For all other dictionaries
367
- else:
368
- # extract the dataframe from the dictionary into a temporary variable
369
- temp_df = dict[sample]
370
- # reassign the dictionary pointer to a nested dictionary.
371
- dict[sample] = {}
372
- # # Iterate through rows in the temp DataFrame
373
- for index, row in temp_df.iterrows():
374
- read = row['read_id'] # read name
375
- position = row['ref_position'] # positional coordinate
376
- probability = row['call_prob'] # Get the probability of the given call
377
- # if the call_code is modified change methylated value to the probability of methylation
378
- if (row['call_code'] in ['a', 'h', 'm']):
379
- methylated = probability
380
- # If the call code is canonical, change the methylated value to 1 - the probability of canonical
381
- elif (row['call_code'] in ['-']):
382
- methylated = 1 - probability
383
-
384
- # If the current read is not in the dictionary yet, initalize the dictionary with a nan filled numpy array of proper size.
385
- if read not in dict[sample]:
386
- dict[sample][read] = np.full(max_reference_length, np.nan)
387
- else:
388
- pass
389
- # add the positional methylation state to the numpy array
390
- dict[sample][read][position-1] = methylated
391
-
392
- # Save the sample files in the batch as gzipped hdf5 files
393
- print('{0}: Converting batch {1} dictionaries to anndata objects'.format(time_string(), batch))
394
- for i, dict_type in enumerate(dict_list):
395
- if i not in dict_to_skip:
396
- # Initialize an hdf5 file for the current modified strand
397
- adata = None
398
- print('{0}: Converting {1} dictionary to an anndata object'.format(time_string(), sample_types[i]))
399
- for record in dict_type.keys():
400
- # Get the dictionary for the modification type of interest from the reference mapping of interest
401
- dict = dict_type[record]
402
- for sample in dict.keys():
403
- print('{0}: Converting {1} dictionary for sample {2} to an anndata object'.format(time_string(), sample_types[i], sample))
404
- sample = int(sample)
405
- final_sample_index = sample + (batch * batch_size)
406
- print('{0}: Final sample index for sample: {1}'.format(time_string(), final_sample_index))
407
- print('{0}: Converting {1} dictionary for sample {2} to a dataframe'.format(time_string(), sample_types[i], final_sample_index))
408
- temp_df = pd.DataFrame.from_dict(dict[sample], orient='index')
409
- sorted_index = sorted(temp_df.index)
410
- temp_df = temp_df.reindex(sorted_index)
411
- X = temp_df.values
412
- one_hot_encodings = record_seq_dict[record][0]
413
- # Initialize empty DataFrames for each base
414
- read_names = list(one_hot_encodings.keys())
415
- sequence_length = one_hot_encodings[read_names[0]].shape[0]
416
- df_A = pd.DataFrame(np.nan, index=sorted_index, columns=range(sequence_length))
417
- df_C = pd.DataFrame(np.nan, index=sorted_index, columns=range(sequence_length))
418
- df_G = pd.DataFrame(np.nan, index=sorted_index, columns=range(sequence_length))
419
- df_T = pd.DataFrame(np.nan, index=sorted_index, columns=range(sequence_length))
420
- df_N = pd.DataFrame(np.nan, index=sorted_index, columns=range(sequence_length))
421
-
422
- # Iterate through the dictionary and populate the DataFrames
423
- for read_name, one_hot_array in one_hot_encodings.items():
424
- df_A.loc[read_name] = one_hot_array[:, 0]
425
- df_C.loc[read_name] = one_hot_array[:, 1]
426
- df_G.loc[read_name] = one_hot_array[:, 2]
427
- df_T.loc[read_name] = one_hot_array[:, 3]
428
- df_N.loc[read_name] = one_hot_array[:, 4]
429
-
430
- ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
431
-
432
- print('{0}: Loading {1} dataframe for sample {2} into a temp anndata object'.format(time_string(), sample_types[i], final_sample_index))
433
- temp_adata = sc.AnnData(X, dtype=X.dtype)
434
- print('{0}: Adding read names and position ids to {1} anndata for sample {2}'.format(time_string(), sample_types[i], final_sample_index))
435
- temp_adata.obs_names = temp_df.index
436
- temp_adata.obs_names = temp_adata.obs_names.astype(str)
437
- temp_adata.var_names = temp_df.columns
438
- temp_adata.var_names = temp_adata.var_names.astype(str)
439
- print('{0}: Adding final sample id to {1} anndata for sample {2}'.format(time_string(), sample_types[i], final_sample_index))
440
- temp_adata.obs['Sample'] = [str(final_sample_index)] * len(temp_adata)
441
- dataset, strand = sample_types[i].split('_')[:2]
442
- temp_adata.obs['Strand'] = [strand] * len(temp_adata)
443
- temp_adata.obs['Dataset'] = [dataset] * len(temp_adata)
444
- temp_adata.obs['Reference'] = [f'{record}_{dataset}_{strand}'] * len(temp_adata)
445
- temp_adata.obs['Reference_chromosome'] = [f'{record}'] * len(temp_adata)
446
-
447
- for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
448
- temp_adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
449
-
450
- # If final adata object already has a sample loaded, concatenate the current sample into the existing adata object
451
- if adata is not None:
452
- print('{0}: Concatenating {1} anndata object for sample {2}'.format(time_string(), sample_types[i], final_sample_index))
453
- adata = ad.concat([adata, temp_adata], join='outer', index_unique=None)
454
- else:
455
- print('{0}: Initializing {1} anndata object for sample {2}'.format(time_string(), sample_types[i], final_sample_index))
456
- adata = temp_adata
457
-
458
- print('{0}: Writing {1} anndata out as a gzipped hdf5 file'.format(time_string(), sample_types[i]))
459
- adata.write_h5ad('{0}_{1}_{2}_SMF_binarized_sample_hdf5.h5ad.gz'.format(date_string, batch, sample_types[i]), compression='gzip')
460
-
461
- # Delete the batch dictionaries from memory
462
- del dict_list
463
- gc.collect()
464
-
465
- # Iterate over all of the batched hdf5 files and concatenate them.
466
- files = os.listdir(os.getcwd())
467
- # Name the final output file
468
- final_hdf = '{0}_{1}_final_experiment_hdf5.h5ad.gz'.format(date_string, experiment_name)
469
- # Filter file names that contain the search string in their filename and keep them in a list
470
- hdfs = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
471
- # Sort file list by names and print the list of file names
472
- hdfs.sort()
473
- print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
474
- final_adata = None
475
- for hdf in hdfs:
476
- print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
477
- temp_adata = sc.read_h5ad(hdf)
478
- if final_adata is not None:
479
- print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
480
- final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
481
- else:
482
- print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
483
- final_adata = temp_adata
484
- print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
485
-
486
- for record in records_to_analyze:
487
- # Add FASTA sequence to the object
488
- sequence = record_seq_dict[record][1]
489
- final_adata.uns[f'{record}_FASTA_sequence'] = sequence
490
- final_adata.var[f'{record}_FASTA_sequence_base'] = list(sequence)
491
-
492
- # Add consensus sequence of samples mapped to the record to the object
493
- record_subset = final_adata[final_adata.obs['Reference_chromosome'] == record].copy()
494
- layer_map, layer_counts = {}, []
495
- for i, layer in enumerate(record_subset.layers):
496
- layer_map[i] = layer.split('_')[0]
497
- layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
498
- count_array = np.array(layer_counts)
499
- nucleotide_indexes = np.argmax(count_array, axis=0)
500
- consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
501
- final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
502
- final_adata.uns[f'{record}_consensus_sequence'] = ''.join(consensus_sequence_list)
503
-
504
- final_adata.write_h5ad(final_hdf, compression='gzip')
505
-
506
- # Delete the individual h5ad files and only keep the final concatenated file
507
- files = os.listdir(os.getcwd())
508
- hdfs_to_delete = [hdf for hdf in files if 'hdf5.h5ad' in hdf and hdf != final_hdf]
509
- # Iterate over the files and delete them
510
- for hdf in hdfs_to_delete:
511
- try:
512
- os.remove(hdf)
513
- print(f"Deleted file: {hdf}")
514
- except OSError as e:
515
- print(f"Error deleting file {hdf}: {e}")
516
- ######################################################################################################
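
Both loaders above populate the per-base A/C/G/T/N layers from a one_hot_encode helper that ships separately in smftools/informatics/helpers/one_hot_encode.py and is not shown in this hunk. As a rough sketch of what that kind of encoder looks like (assuming a fixed A/C/G/T/N alphabet and numpy; the packaged helper may differ):

import numpy as np

def one_hot_encode(sequence, alphabet='ACGTN'):
    # Illustrative sketch only, not the packaged helper.
    # Returns a len(sequence) x 5 array with one column per base in the alphabet;
    # bases outside the alphabet (ambiguous or masked calls) fall into the N column.
    column_of = {base: i for i, base in enumerate(alphabet)}
    encoding = np.zeros((len(sequence), len(alphabet)), dtype=np.uint8)
    for row, base in enumerate(sequence):
        encoding[row, column_of.get(base.upper(), column_of['N'])] = 1
    return encoding

one_hot_encode('ACgN')[:, 1]  # C column -> array([0, 1, 0, 0], dtype=uint8)
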
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -1,31 +0,0 @@
1
- ## binarize_converted_base_identities
2
- # Conversion SMF specific
3
- def binarize_converted_base_identities(base_identities, strand, modification_type):
4
- """
5
- Binarizes conversion SMF data within a sequence string
6
-
7
- Parameters:
8
- base_identities (dict): A dictionary returned by extract_base_identity_at_coordinates.
9
- strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
10
- modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
11
-
12
- Returns:
13
- binarized_base_identities (dict): A binarized dictionary, where 1 represents a methylated site. 0 represents an unmethylated site. NaN represents a site that does not carry methylation information.
14
- """
15
- import numpy as np
16
- binarized_base_identities = {}
17
- # Iterate over base identity keys to binarize the base identities
18
- for key in base_identities.keys():
19
- if strand == 'top':
20
- if modification_type == '5mC':
21
- binarized_base_identities[key] = [1 if x == 'C' else 0 if x == 'T' else np.nan for x in base_identities[key]]
22
- elif modification_type == '6mA':
23
- binarized_base_identities[key] = [1 if x == 'A' else 0 if x == 'G' else np.nan for x in base_identities[key]]
24
- elif strand == 'bottom':
25
- if modification_type == '5mC':
26
- binarized_base_identities[key] = [1 if x == 'G' else 0 if x == 'A' else np.nan for x in base_identities[key]]
27
- elif modification_type == '6mA':
28
- binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
29
- else:
30
- pass
31
- return binarized_base_identities
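
To make the mapping concrete, here is a hypothetical call for a top-strand 5mC (e.g. GpC) experiment; the read names and base lists are made up for illustration:

base_identities = {
    'read_1': ['C', 'T', 'A', 'C'],  # methylated C, converted (unmethylated) C, uninformative base, methylated C
    'read_2': ['T', 'T', 'G', 'C'],
}
binarized = binarize_converted_base_identities(base_identities, strand='top', modification_type='5mC')
# binarized == {'read_1': [1, 0, nan, 1], 'read_2': [0, 0, nan, 1]}
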
smftools/informatics/helpers/canoncall.py
@@ -1,23 +0,0 @@
1
- ## canoncall
2
-
3
- # Conversion SMF specific
4
- def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
5
- """
6
- Wrapper function for dorado canonical base calling.
7
-
8
- Parameters:
9
- model (str): a string representing the file path to the dorado basecalling model.
10
- pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
11
- barcode_kit (str): A string representing the barcoding kit used in the experiment.
12
- bam (str): File path to the BAM file to output.
13
- bam_suffix (str): The suffix to use for the BAM file.
14
-
15
- Returns:
16
- None
17
- Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
18
- """
19
- import subprocess
20
- output = bam + bam_suffix
21
- command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
22
- with open(output, "w") as outfile:
23
- subprocess.run(command, stdout=outfile)
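
For orientation, a hypothetical invocation of the wrapper above; the model name, POD5 directory, and barcoding kit are placeholders, and dorado must be available on the PATH:

canoncall(
    model='dna_r10.4.1_e8.2_400bps_sup@v4.2.0',  # placeholder basecalling model
    pod5_dir='experiment/pod5/',                 # placeholder directory of POD5 files
    barcode_kit='SQK-NBD114-24',                 # placeholder barcoding kit
    bam='canonical_basecalls',                   # output path prefix
    bam_suffix='.bam',
)
# dorado's stdout is redirected into canonical_basecalls.bam
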