smftools 0.1.3__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/METADATA +44 -11
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -20
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/F1_sample_sheet.csv +0 -5
  8. smftools/datasets/__init__.py +0 -9
  9. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  10. smftools/datasets/datasets.py +0 -28
  11. smftools/informatics/__init__.py +0 -14
  12. smftools/informatics/archived/bam_conversion.py +0 -59
  13. smftools/informatics/archived/bam_direct.py +0 -63
  14. smftools/informatics/archived/basecalls_to_adata.py +0 -71
  15. smftools/informatics/conversion_smf.py +0 -79
  16. smftools/informatics/direct_smf.py +0 -89
  17. smftools/informatics/fast5_to_pod5.py +0 -21
  18. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  19. smftools/informatics/helpers/__init__.py +0 -60
  20. smftools/informatics/helpers/align_and_sort_BAM.py +0 -48
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -73
  22. smftools/informatics/helpers/archived/informatics.py +0 -260
  23. smftools/informatics/helpers/archived/load_adata.py +0 -516
  24. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  25. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  26. smftools/informatics/helpers/canoncall.py +0 -25
  27. smftools/informatics/helpers/complement_base_list.py +0 -21
  28. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -54
  29. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -233
  30. smftools/informatics/helpers/count_aligned_reads.py +0 -43
  31. smftools/informatics/helpers/extract_base_identities.py +0 -57
  32. smftools/informatics/helpers/extract_mods.py +0 -51
  33. smftools/informatics/helpers/extract_readnames_from_BAM.py +0 -22
  34. smftools/informatics/helpers/find_conversion_sites.py +0 -61
  35. smftools/informatics/helpers/generate_converted_FASTA.py +0 -98
  36. smftools/informatics/helpers/get_chromosome_lengths.py +0 -32
  37. smftools/informatics/helpers/get_native_references.py +0 -28
  38. smftools/informatics/helpers/index_fasta.py +0 -12
  39. smftools/informatics/helpers/make_dirs.py +0 -21
  40. smftools/informatics/helpers/make_modbed.py +0 -27
  41. smftools/informatics/helpers/modQC.py +0 -27
  42. smftools/informatics/helpers/modcall.py +0 -28
  43. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -518
  44. smftools/informatics/helpers/ohe_batching.py +0 -52
  45. smftools/informatics/helpers/one_hot_encode.py +0 -21
  46. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -52
  47. smftools/informatics/helpers/separate_bam_by_bc.py +0 -43
  48. smftools/informatics/helpers/split_and_index_BAM.py +0 -41
  49. smftools/informatics/load_adata.py +0 -127
  50. smftools/informatics/readwrite.py +0 -106
  51. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  52. smftools/informatics/subsample_pod5.py +0 -104
  53. smftools/plotting/__init__.py +0 -0
  54. smftools/preprocessing/__init__.py +0 -34
  55. smftools/preprocessing/append_C_context.py +0 -69
  56. smftools/preprocessing/archives/preprocessing.py +0 -614
  57. smftools/preprocessing/binarize_on_Youden.py +0 -42
  58. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  59. smftools/preprocessing/calculate_complexity.py +0 -71
  60. smftools/preprocessing/calculate_consensus.py +0 -47
  61. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -96
  62. smftools/preprocessing/calculate_coverage.py +0 -41
  63. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  64. smftools/preprocessing/calculate_position_Youden.py +0 -104
  65. smftools/preprocessing/calculate_read_length_stats.py +0 -86
  66. smftools/preprocessing/clean_NaN.py +0 -38
  67. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -29
  68. smftools/preprocessing/filter_reads_on_length.py +0 -41
  69. smftools/preprocessing/invert_adata.py +0 -23
  70. smftools/preprocessing/load_sample_sheet.py +0 -24
  71. smftools/preprocessing/make_dirs.py +0 -21
  72. smftools/preprocessing/mark_duplicates.py +0 -134
  73. smftools/preprocessing/min_non_diagonal.py +0 -25
  74. smftools/preprocessing/recipes.py +0 -125
  75. smftools/preprocessing/remove_duplicates.py +0 -21
  76. smftools/readwrite.py +0 -106
  77. smftools/tools/__init__.py +0 -0
  78. smftools/tools/apply_HMM.py +0 -1
  79. smftools/tools/cluster.py +0 -0
  80. smftools/tools/read_HMM.py +0 -1
  81. smftools/tools/subset_adata.py +0 -32
  82. smftools/tools/train_HMM.py +0 -43
  83. smftools-0.1.3.dist-info/RECORD +0 -84
  84. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  85. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,54 +0,0 @@
- # concatenate_fastqs_to_bam
-
- def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
-     """
-     Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
-
-     Parameters:
-         fastq_files (list): List of paths to demultiplexed FASTQ files.
-         output_bam (str): Path to the output BAM file.
-         barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
-         gzip_suffix (str): Suffix to use for gzipped input files (default: '.gz').
-
-     Returns:
-         None
-     """
-     import os
-     import pysam
-     import gzip
-     from Bio import SeqIO
-     from tqdm import tqdm
-
-     n_fastqs = len(fastq_files)
-
-     with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
-         for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
-             # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
-             base_name = os.path.basename(fastq_file)
-             if n_fastqs > 1:
-                 if base_name.endswith('.fastq.gz'):
-                     barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
-                 elif base_name.endswith('.fq.gz'):
-                     barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
-                 elif base_name.endswith('.fastq'):
-                     barcode = base_name.split('_')[-1].replace('.fastq', '')
-                 elif base_name.endswith('.fq'):
-                     barcode = base_name.split('_')[-1].replace('.fq', '')
-                 else:
-                     raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
-
-             # Read the FASTQ file (handle gzipped and non-gzipped files)
-             open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
-             with open_func(fastq_file, 'rt') as fq_in:
-                 for record in SeqIO.parse(fq_in, 'fastq'):
-                     # Create an unaligned BAM entry for each FASTQ record
-                     aln = pysam.AlignedSegment()
-                     aln.query_name = record.id
-                     aln.query_sequence = str(record.seq)
-                     aln.flag = 4  # Unmapped
-                     aln.query_qualities = record.letter_annotations["phred_quality"]  # Biopython already provides integer Phred scores
-                     if n_fastqs > 1:
-                         # Add the barcode to the BC tag
-                         aln.set_tag(barcode_tag, barcode)
-                     # Write to BAM file
-                     bam_out.write(aln)
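A minimal usage sketch for the removed helper (the import path is inferred from the file listing above, and the file names are hypothetical):

    import glob
    from smftools.informatics.helpers.concatenate_fastqs_to_bam import concatenate_fastqs_to_bam

    # Hypothetical demultiplexed FASTQs named like sample_barcode01.fastq.gz
    fastqs = sorted(glob.glob("demux/*.fastq.gz"))
    concatenate_fastqs_to_bam(fastqs, "unaligned.bam", barcode_tag="BC")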
@@ -1,233 +0,0 @@
- ## converted_BAM_to_adata
-
- def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
-     """
-     A wrapper function to take converted aligned_sorted_split BAM files and format the data into an anndata object.
-
-     Parameters:
-         converted_FASTA (str): File path to the converted FASTA reference.
-         split_dir (str): File path to the directory containing the converted aligned_sorted_split BAM files.
-         mapping_threshold (float): A value between 0 and 1 giving the minimal fraction of aligned reads that must map to a reference region. References at or above the threshold are included in the output adata.
-         experiment_name (str): An experiment name to use for the output adata file.
-         conversion_types (list): A list of strings of the conversion types to use in the analysis.
-         bam_suffix (str): The suffix to use for the BAM file.
-
-     Returns:
-         None
-         Outputs a single gzipped adata object for the experiment.
-     """
-     from .. import readwrite
-     from .binarize_converted_base_identities import binarize_converted_base_identities
-     from .find_conversion_sites import find_conversion_sites
-     from .count_aligned_reads import count_aligned_reads
-     from .extract_base_identities import extract_base_identities
-     from .make_dirs import make_dirs
-     from .ohe_batching import ohe_batching
-     import pandas as pd
-     import numpy as np
-     import anndata as ad
-     import os
-     from tqdm import tqdm
-     import gc
-
-     ##########################################################################################
-     ## Get file paths and make necessary directories. ##
-     # Get all of the input BAM files
-     files = os.listdir(split_dir)
-     # Make output dirs
-     parent_dir = os.path.dirname(split_dir)
-     h5_dir = os.path.join(parent_dir, 'h5ads')
-     tmp_dir = os.path.join(parent_dir, 'tmp')
-     make_dirs([h5_dir, tmp_dir])
-     # Keep file names that contain the BAM suffix and are not index files
-     bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
-     # Sort the file list by name and print it
-     bams.sort()
-     bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
-     print(f'Found the following BAMs: {bams}')
-     final_adata = None
-     ##########################################################################################
-
-     ##########################################################################################
-
-     ## need to fix this section
-     # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) unconverted sequence string, 5) unconverted complement sequence
-     modification_dict = {}
-     # Init a dict keyed by FASTA record that points to the sequence string of the unconverted record
-     record_FASTA_dict = {}
-     # While populating the dictionary, also extract the longest sequence record in the input references
-     max_reference_length = 0
-     for conversion_type in conversion_types:
-         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) unconverted sequence string, 5) unconverted complement sequence
-         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
-         # Get the max reference length
-         for record in modification_dict[conversion_type].keys():
-             if modification_dict[conversion_type][record][0] > max_reference_length:
-                 max_reference_length = modification_dict[conversion_type][record][0]
-
-             mod_type, strand = record.split('_')[-2:]
-
-             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
-             unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
-             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
-             delta_max_length = max_reference_length - current_reference_length
-             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
-             complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
-             record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
-     ##########################################################################################
-
-     ##########################################################################################
-     bam_alignment_stats_dict = {}
-     records_to_analyze = []
-     for bam_index, bam in enumerate(bam_path_list):
-         bam_alignment_stats_dict[bam_index] = {}
-         # Look at aligned read proportions in the BAM
-         aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
-         percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
-         print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
-         bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
-         # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
-         for record in record_counts:
-             print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
-             if record_counts[record][1] >= mapping_threshold:
-                 records_to_analyze.append(record)
-                 bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
-     records_to_analyze = set(records_to_analyze)
-     ##########################################################################################
-
-     ##########################################################################################
-     # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
-     # Save the file paths in the bam_record_ohe_files dict.
-     bam_record_ohe_files = {}
-
-     # Iterate over split BAMs
-     for bam_index, bam in enumerate(bam_path_list):
-         # Iterate over references to process
-         for record in records_to_analyze:
-             unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
-             sample = bams[bam_index].split(sep=bam_suffix)[0]
-             chromosome = record_FASTA_dict[unconverted_record_name][2]
-             current_reference_length = record_FASTA_dict[unconverted_record_name][4]
-             mod_type = record_FASTA_dict[unconverted_record_name][6]
-             strand = record_FASTA_dict[unconverted_record_name][7]
-
-             # Extract the base identities of reads aligned to the record
-             fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
-
-             # Binarize the dictionary of positional identities
-             print('Binarizing base identities')
-             fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type)
-             rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type)
-             merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
-             # Convert the base identity dictionary to a dataframe
-             binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
-             sorted_index = sorted(binarized_base_identities_df.index)
-             binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
-
-             # Load an anndata object with the sample data
-             X = binarized_base_identities_df.values
-             adata = ad.AnnData(X, dtype=X.dtype)
-             if adata.shape[0] > 0:
-                 adata.obs_names = binarized_base_identities_df.index.astype(str)
-                 adata.var_names = binarized_base_identities_df.columns.astype(str)
-                 adata.obs['Sample'] = [sample] * len(adata)
-                 adata.obs['Strand'] = [strand] * len(adata)
-                 adata.obs['Dataset'] = [mod_type] * len(adata)
-                 adata.obs['Reference'] = [record] * len(adata)
-                 adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
-
-                 read_mapping_direction = []
-                 for read_id in adata.obs_names:
-                     if read_id in fwd_base_identities.keys():
-                         read_mapping_direction.append('fwd')
-                     elif read_id in rev_base_identities.keys():
-                         read_mapping_direction.append('rev')
-                     else:
-                         read_mapping_direction.append('unk')
-
-                 adata.obs['Read_mapping_direction'] = read_mapping_direction
-
-                 # One hot encode the sequence string of the reads
-                 fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd", batch_size=100000)
-                 rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev", batch_size=100000)
-                 bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
-                 del fwd_base_identities, rev_base_identities
-
-                 one_hot_reads = {}
-                 n_rows_OHE = 5
-                 for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
-                     tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
-                     one_hot_reads.update(tmp_ohe_dict)
-                     del tmp_ohe_dict
-
-                 read_names = list(one_hot_reads.keys())
-                 dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
-
-                 sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
-                 df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-                 df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-                 df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-                 df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-                 df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-
-                 for read_name, one_hot_array in one_hot_reads.items():
-                     one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
-                     dict_A[read_name] = one_hot_array[0, :]
-                     dict_C[read_name] = one_hot_array[1, :]
-                     dict_G[read_name] = one_hot_array[2, :]
-                     dict_T[read_name] = one_hot_array[3, :]
-                     dict_N[read_name] = one_hot_array[4, :]
-
-                 del one_hot_reads
-                 gc.collect()
-
-                 for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
-                     df_A.iloc[j] = dict_A[read_name]
-                     df_C.iloc[j] = dict_C[read_name]
-                     df_G.iloc[j] = dict_G[read_name]
-                     df_T.iloc[j] = dict_T[read_name]
-                     df_N.iloc[j] = dict_N[read_name]
-
-                 del dict_A, dict_C, dict_G, dict_T, dict_N
-                 gc.collect()
-
-                 ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
-
-                 for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
-                     adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
-                     ohe_df_map[j] = None  # Release the reference to reduce memory usage
-
-                 if final_adata:
-                     if adata.shape[0] > 0:
-                         final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
-                     else:
-                         print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
-                 else:
-                     if adata.shape[0] > 0:
-                         final_adata = adata
-                     else:
-                         print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
-
-             else:
-                 print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
-
-     # Set obs columns to type 'category'
-     for col in final_adata.obs.columns:
-         final_adata.obs[col] = final_adata.obs[col].astype('category')
-
-     for record in records_to_analyze:
-         unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
-         sequence = record_FASTA_dict[unconverted_record_name][0]
-         complement = record_FASTA_dict[unconverted_record_name][1]
-         chromosome = record_FASTA_dict[unconverted_record_name][2]
-         final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
-         final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
-         final_adata.uns[f'{record}_FASTA_sequence'] = sequence
-
-     ######################################################################################################
-
-     ######################################################################################################
-     ## Export the final adata object
-     final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
-     final_adata.write_h5ad(final_output, compression='gzip')
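A hedged invocation sketch for the wrapper above (all paths and values are hypothetical; per the code, conversion_types[0] is expected to be the unconverted label):

    converted_BAM_to_adata(
        converted_FASTA="refs/converted.fasta",
        split_dir="run/aligned_sorted_split",
        mapping_threshold=0.01,
        experiment_name="SMF_run1",
        conversion_types=["unconverted", "5mC"],
        bam_suffix=".bam",
    )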
@@ -1,43 +0,0 @@
- ## count_aligned_reads
-
- # General
- def count_aligned_reads(bam_file):
-     """
-     Counts the number of aligned reads in a BAM file that map to each reference record.
-
-     Parameters:
-         bam_file (str): A string representing the path to an aligned BAM file.
-
-     Returns:
-         aligned_reads_count (int): The total number of reads aligned in the BAM.
-         unaligned_reads_count (int): The total number of reads not aligned in the BAM.
-         record_counts (dict): A dictionary keyed by reference record that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
-     """
-     from .. import readwrite
-     import pysam
-     from tqdm import tqdm
-     from collections import defaultdict
-
-     print('{0}: Counting aligned reads in BAM > {1}'.format(readwrite.time_string(), bam_file))
-     aligned_reads_count = 0
-     unaligned_reads_count = 0
-     # Make a dictionary, keyed by reference record name, that counts the reads mapped to each record
-     record_counts = defaultdict(int)
-
-     with pysam.AlignmentFile(bam_file, "rb") as bam:
-         total_reads = bam.mapped + bam.unmapped
-         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
-         for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
-             if read.is_unmapped:
-                 unaligned_reads_count += 1
-             else:
-                 aligned_reads_count += 1
-                 record_counts[read.reference_name] += 1  # Automatically increments if the key exists, adds it if not
-
-     # Reformat the dictionary so each record points to (read count, proportion of mapped reads)
-     for reference in record_counts:
-         proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
-         record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
-
-     return aligned_reads_count, unaligned_reads_count, dict(record_counts)
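Reading the documented return values back might look like this (file name hypothetical):

    aligned, unaligned, record_counts = count_aligned_reads("sample.bam")
    for record, (n_reads, fraction) in record_counts.items():
        print(f"{record}: {n_reads} reads ({fraction:.1%} of mapped reads)")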
@@ -1,57 +0,0 @@
- ## extract_base_identities
-
- # General
- def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
-     """
-     Extracts the base identity at every reference-coordinate position covered by each mapped read.
-
-     Parameters:
-         bam_file (str): File path to the position-sorted, indexed BAM file.
-         chromosome (str): The name of the record within the reference FASTA.
-         positions (list): A list of position coordinates within the record to extract.
-         max_reference_length (int): The maximum length of a record in the reference set.
-
-     Returns:
-         fwd_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from forward-mapped reads. Positions the read does not cover are filled with an N value.
-         rev_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from reverse-mapped reads. Positions the read does not cover are filled with an N value.
-     """
-     from .. import readwrite
-     import pysam
-     from tqdm import tqdm
-
-     positions = set(positions)
-     # Initialize base identity dictionaries that hold key-value pairs of read name and a list of base identities at positions of interest
-     fwd_base_identities = {}
-     rev_base_identities = {}
-     # Open the position-sorted BAM file
-     print('{0}: Reading BAM file: {1}'.format(readwrite.time_string(), bam_file))
-     with pysam.AlignmentFile(bam_file, "rb") as bam:
-         # Iterate over every read in the BAM that comes from the chromosome of interest
-         print('{0}: Iterating over reads in BAM'.format(readwrite.time_string()))
-         total_reads = bam.mapped
-         for read in tqdm(bam.fetch(chromosome), desc='Extracting base identities from reads in BAM', total=total_reads):
-             # Only iterate over mapped reads
-             if read.is_mapped:
-                 # Get the read sequence. Pysam reports fwd-mapped reads as the true read sequence, and rev-mapped reads as the reverse complement of the read.
-                 query_sequence = read.query_sequence
-                 # If the read aligned as a reverse complement, record it in the reverse dictionary
-                 if read.is_reverse:
-                     # Initialize the read key by pointing to an N-filled list of length max_reference_length
-                     rev_base_identities[read.query_name] = ['N'] * max_reference_length
-                     # Iterate over tuples containing the 0-indexed position relative to the read.query_sequence start, as well as the 0-based index relative to the reference
-                     for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
-                         # If the aligned read's reference coordinate is in the positions set and the read position was successfully mapped
-                         if reference_position in positions and read_position is not None:
-                             # Get the base identity in the read corresponding to that position
-                             rev_base_identities[read.query_name][reference_position] = query_sequence[read_position]
-                 else:
-                     # Initialize the read key by pointing to an N-filled list of length max_reference_length
-                     fwd_base_identities[read.query_name] = ['N'] * max_reference_length
-                     # Iterate over tuples containing the 0-indexed position relative to the read.query_sequence start, as well as the 0-based index relative to the reference
-                     for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
-                         # If the aligned read's reference coordinate is in the positions set and the read position was successfully mapped
-                         if reference_position in positions and read_position is not None:
-                             # Get the base identity in the read corresponding to that position
-                             fwd_base_identities[read.query_name][reference_position] = query_sequence[read_position]
-
-     return fwd_base_identities, rev_base_identities
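A usage sketch (record name and length hypothetical; bam.fetch() requires a position-sorted, indexed BAM):

    record_length = 4000  # hypothetical reference record length
    fwd, rev = extract_base_identities(
        "sample.bam", "locus1_unconverted_top", range(record_length), record_length
    )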
@@ -1,51 +0,0 @@
- ## extract_mods
-
- def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix):
-     """
-     Takes all of the aligned, sorted, split modified BAM files and runs Nanopore's modkit extract to load the modification data into zipped TSV files.
-
-     Parameters:
-         thresholds (list): A list of thresholds to use for marking each basecalled base as passing or failing on canonical and modification call status.
-         mod_tsv_dir (str): File path to the directory to hold the modkit extract outputs.
-         split_dir (str): File path to the directory containing the converted aligned_sorted_split BAM files.
-         bam_suffix (str): The suffix to use for the BAM file.
-
-     Returns:
-         None
-         Runs modkit extract on input aligned_sorted_split modified BAM files to output zipped TSVs containing modification calls.
-     """
-     import os
-     import subprocess
-     import glob
-     import zipfile
-
-     os.chdir(mod_tsv_dir)
-     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-     bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
-     for input_file in bam_files:
-         print(input_file)
-         # Extract the file basename
-         file_name = os.path.basename(input_file)
-         # Construct the output TSV file path
-         output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
-         output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
-         # Run modkit summary
-         subprocess.run(["modkit", "summary", input_file])
-         # Run modkit extract
-         subprocess.run([
-             "modkit", "extract",
-             "--filter-threshold", f'{filter_threshold}',
-             "--mod-thresholds", f"m:{m5C_threshold}",
-             "--mod-thresholds", f"a:{m6A_threshold}",
-             "--mod-thresholds", f"h:{hm5C_threshold}",
-             input_file, "null",
-             "--read-calls", output_tsv
-         ])
-         # Zip the output TSV
-         print(f'zipping {output_tsv}')
-         with zipfile.ZipFile(f"{output_tsv}.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
-             zipf.write(output_tsv, os.path.basename(output_tsv))
-         # Remove the unzipped TSV
-         print(f'removing {output_tsv}')
-         os.remove(output_tsv)
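A hedged call sketch; the threshold order follows the unpacking in the code above, the values are placeholders, and the modkit binary must be on PATH:

    thresholds = [0.8, 0.8, 0.8, 0.8]  # filter, m6A, m5C, hm5C (hypothetical values)
    extract_mods(thresholds, "run/mod_tsvs", "run/aligned_sorted_split", ".bam")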
@@ -1,22 +0,0 @@
- # extract_readnames_from_BAM
-
- def extract_readnames_from_BAM(aligned_BAM):
-     """
-     Takes a BAM and writes out a txt file containing the read names from the BAM.
-
-     Parameters:
-         aligned_BAM (str): Path to an input aligned BAM to extract read names from.
-
-     Returns:
-         None
-     """
-     import subprocess
-     # Make a text file of read names for the BAM
-     txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
-     samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
-     with open(txt_output, "w") as output_file:
-         cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
-         samtools_view.stdout.close()
-         cut_process.wait()
-     samtools_view.wait()
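For environments without the samtools CLI, a rough pysam-only equivalent is sketched below; read_names_via_pysam is a hypothetical helper, not part of the package:

    import pysam

    def read_names_via_pysam(bam_path, txt_output):
        # Iterate every record, mirroring `samtools view | cut -f1`
        with pysam.AlignmentFile(bam_path, "rb") as bam, open(txt_output, "w") as out:
            for read in bam.fetch(until_eof=True):
                out.write(read.query_name + "\n")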
@@ -1,61 +0,0 @@
- ## find_conversion_sites
-
- def find_conversion_sites(fasta_file, modification_type, conversion_types):
-     """
-     Finds the genomic coordinates of every cytosine in each unconverted record contained within a FASTA file.
-     If searching for adenine conversions, it instead finds the coordinates of all adenines.
-
-     Parameters:
-         fasta_file (str): File path to the converted reference FASTA.
-         modification_type (str): The modification type of interest (options are '5mC' and '6mA').
-         conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
-
-     Returns:
-         record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string, 5) complement sequence.
-     """
-     from .. import readwrite
-     from Bio import SeqIO
-     from Bio.SeqRecord import SeqRecord
-     from Bio.Seq import Seq
-
-     unconverted = conversion_types[0]
-     record_dict = {}
-     # Open the FASTA file as read only
-     with open(fasta_file, "r") as f:
-         # Iterate over records in the FASTA
-         for record in SeqIO.parse(f, "fasta"):
-             # Only iterate over the unconverted records for the reference
-             if unconverted in record.id:
-                 # Initialize lists to hold the top and bottom strand positional coordinates of interest for this record
-                 top_strand_coordinates = []
-                 bottom_strand_coordinates = []
-                 # Extract the sequence string of the record
-                 sequence = str(record.seq).upper()
-                 complement = str(record.seq.complement()).upper()
-                 sequence_length = len(sequence)
-                 if modification_type == '5mC':
-                     # Iterate over the sequence string from the record
-                     for i in range(len(sequence)):
-                         if sequence[i] == 'C':
-                             top_strand_coordinates.append(i)  # 0-indexed coordinate
-                         if sequence[i] == 'G':
-                             bottom_strand_coordinates.append(i)  # 0-indexed coordinate
-                 elif modification_type == '6mA':
-                     # Iterate over the sequence string from the record
-                     for i in range(len(sequence)):
-                         if sequence[i] == 'A':
-                             top_strand_coordinates.append(i)  # 0-indexed coordinate
-                         if sequence[i] == 'T':
-                             bottom_strand_coordinates.append(i)  # 0-indexed coordinate
-                 else:
-                     # modification_type not recognized; valid options are 5mC and 6mA
-                     pass
-                 record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
-     return record_dict
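A sketch of consuming the documented return value (the record id is hypothetical and assumes the {chromosome}_{conversion}_{strand} naming used elsewhere in this diff):

    sites = find_conversion_sites("refs/converted.fasta", "5mC", ["unconverted", "5mC"])
    length, top_coords, bottom_coords, seq, comp = sites["locus1_unconverted_top"]
    print(f"{len(top_coords)} top-strand cytosines across {length} bp")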
@@ -1,98 +0,0 @@
- ## generate_converted_FASTA
-
- def convert_FASTA_record(record, modification_type, strand, unconverted):
-     """
-     Takes a FASTA record and converts every instance of a base to the converted state.
-
-     Parameters:
-         record (SeqRecord): A record instance from within the FASTA.
-         modification_type (str): The modification type to convert for (options are '5mC' and '6mA').
-         strand (str): The strand that is being converted in the experiment (options are 'top' and 'bottom').
-         unconverted (str): The label used for unconverted records.
-
-     Returns:
-         new_seq (str): Converted sequence string.
-         new_id (str): Record id for the converted sequence string.
-     """
-     if modification_type == '5mC':
-         if strand == 'top':
-             # Replace every 'C' with 'T' in the sequence
-             new_seq = record.seq.upper().replace('C', 'T')
-         elif strand == 'bottom':
-             # Replace every 'G' with 'A' in the sequence
-             new_seq = record.seq.upper().replace('G', 'A')
-         else:
-             print('need to provide a valid strand string: top or bottom')
-         new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
-     elif modification_type == '6mA':
-         if strand == 'top':
-             # Replace every 'A' with 'G' in the sequence
-             new_seq = record.seq.upper().replace('A', 'G')
-         elif strand == 'bottom':
-             # Replace every 'T' with 'C' in the sequence
-             new_seq = record.seq.upper().replace('T', 'C')
-         else:
-             print('need to provide a valid strand string: top or bottom')
-         new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
-     elif modification_type == unconverted:
-         new_seq = record.seq.upper()
-         new_id = '{0}_{1}_top'.format(record.id, modification_type)
-     else:
-         print(f'need to provide a valid modification_type string: 5mC, 6mA, or {unconverted}')
-
-     return new_seq, new_id
-
- def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
-     """
-     Uses convert_FASTA_record on every record within the FASTA to write out a converted FASTA.
-
-     Parameters:
-         input_fasta (str): Path to the unconverted FASTA file.
-         modification_types (list): A list of modification types to use in the experiment.
-         strands (list): A list of conversion strands to use in the experiment.
-         output_fasta (str): Path to the converted FASTA output file.
-
-     Returns:
-         None
-         Writes out a converted FASTA reference for the experiment.
-     """
-     from .. import readwrite
-     from Bio import SeqIO
-     from Bio.SeqRecord import SeqRecord
-     from Bio.Seq import Seq
-     import gzip
-     modified_records = []
-     unconverted = modification_types[0]
-     # Iterate over each record in the input FASTA
-     if '.gz' in input_fasta:
-         with gzip.open(input_fasta, 'rt') as handle:
-             for record in SeqIO.parse(handle, 'fasta'):
-                 record_description = record.description
-                 # Iterate over each modification type of interest
-                 for modification_type in modification_types:
-                     # Iterate over the strands of interest
-                     for i, strand in enumerate(strands):
-                         if i > 0 and modification_type == unconverted:  # Ensures that the unconverted record is only added once
-                             pass
-                         else:
-                             # Add the modified record to the list of modified records
-                             print(f'converting {modification_type} on the {strand} strand of record {record.id}')
-                             new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
-                             new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
-                             modified_records.append(new_record)
-     else:
-         for record in SeqIO.parse(input_fasta, 'fasta'):
-             record_description = record.description
-             # Iterate over each modification type of interest
-             for modification_type in modification_types:
-                 # Iterate over the strands of interest
-                 for i, strand in enumerate(strands):
-                     if i > 0 and modification_type == unconverted:  # Ensures that the unconverted record is only added once
-                         pass
-                     else:
-                         # Add the modified record to the list of modified records
-                         print(f'converting {modification_type} on the {strand} strand of record {record.id}')
-                         new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
-                         new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
-                         modified_records.append(new_record)
-
-     with open(output_fasta, 'w') as output_handle:
-         # Write out the concatenated FASTA file of modified sequences
-         SeqIO.write(modified_records, output_handle, 'fasta')
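A hedged call sketch (paths hypothetical; as in the rest of this code, the unconverted label is expected first in modification_types):

    generate_converted_FASTA(
        "refs/genome.fa",
        modification_types=["unconverted", "5mC"],
        strands=["top", "bottom"],
        output_fasta="refs/converted.fasta",
    )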
@@ -1,32 +0,0 @@
- # get_chromosome_lengths
-
- def get_chromosome_lengths(fasta):
-     """
-     Generates a file containing the chromosome lengths within an input FASTA.
-
-     Parameters:
-         fasta (str): Path to the input FASTA.
-     """
-     import os
-     import subprocess
-     from .index_fasta import index_fasta
-
-     # Make a FASTA index file if one isn't already available
-     index_path = f'{fasta}.fai'
-     if os.path.exists(index_path):
-         print(f'Using existing FASTA index file: {index_path}')
-     else:
-         index_fasta(fasta)
-
-     parent_dir = os.path.dirname(fasta)
-     fasta_basename = os.path.basename(fasta)
-     chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
-     chrom_path = os.path.join(parent_dir, chrom_basename)
-
-     # Make a chromosome length file
-     if os.path.exists(chrom_path):
-         print(f'Using existing chrom length file: {chrom_path}')
-     else:
-         with open(chrom_path, 'w') as outfile:
-             command = ["cut", "-f1,2", index_path]
-             subprocess.run(command, stdout=outfile)
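A usage sketch (path hypothetical); the expected side effects follow from the code above:

    get_chromosome_lengths("refs/genome.fa")
    # Writes refs/genome.fa.fai (if absent) and refs/genome.chrom.sizes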