smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. smftools-0.1.6.dist-info/METADATA +127 -0
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -19
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/__init__.py +0 -9
  8. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  9. smftools/datasets/datasets.py +0 -27
  10. smftools/informatics/__init__.py +0 -12
  11. smftools/informatics/bam_conversion.py +0 -47
  12. smftools/informatics/bam_direct.py +0 -49
  13. smftools/informatics/basecalls_to_adata.py +0 -42
  14. smftools/informatics/fast5_to_pod5.py +0 -19
  15. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  16. smftools/informatics/helpers/__init__.py +0 -42
  17. smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
  18. smftools/informatics/helpers/archived/informatics.py +0 -260
  19. smftools/informatics/helpers/archived/load_adata.py +0 -516
  20. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  21. smftools/informatics/helpers/canoncall.py +0 -23
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
  23. smftools/informatics/helpers/count_aligned_reads.py +0 -39
  24. smftools/informatics/helpers/extract_base_identities.py +0 -43
  25. smftools/informatics/helpers/extract_mods.py +0 -51
  26. smftools/informatics/helpers/find_conversion_sites.py +0 -59
  27. smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
  28. smftools/informatics/helpers/get_native_references.py +0 -28
  29. smftools/informatics/helpers/make_dirs.py +0 -21
  30. smftools/informatics/helpers/make_modbed.py +0 -27
  31. smftools/informatics/helpers/modQC.py +0 -27
  32. smftools/informatics/helpers/modcall.py +0 -26
  33. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
  34. smftools/informatics/helpers/one_hot_encode.py +0 -19
  35. smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
  36. smftools/informatics/helpers/split_and_index_BAM.py +0 -29
  37. smftools/informatics/pod5_conversion.py +0 -53
  38. smftools/informatics/pod5_direct.py +0 -55
  39. smftools/informatics/pod5_to_adata.py +0 -40
  40. smftools/informatics/readwrite.py +0 -106
  41. smftools/informatics/subsample_pod5.py +0 -48
  42. smftools/plotting/__init__.py +0 -0
  43. smftools/preprocessing/__init__.py +0 -29
  44. smftools/preprocessing/append_C_context.py +0 -46
  45. smftools/preprocessing/archives/preprocessing.py +0 -614
  46. smftools/preprocessing/binarize_on_Youden.py +0 -42
  47. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  48. smftools/preprocessing/calculate_complexity.py +0 -71
  49. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
  50. smftools/preprocessing/calculate_coverage.py +0 -41
  51. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  52. smftools/preprocessing/calculate_position_Youden.py +0 -104
  53. smftools/preprocessing/calculate_read_length_stats.py +0 -32
  54. smftools/preprocessing/clean_NaN.py +0 -38
  55. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
  56. smftools/preprocessing/filter_reads_on_length.py +0 -39
  57. smftools/preprocessing/invert_adata.py +0 -22
  58. smftools/preprocessing/mark_duplicates.py +0 -119
  59. smftools/preprocessing/min_non_diagonal.py +0 -25
  60. smftools/preprocessing/remove_duplicates.py +0 -18
  61. smftools/readwrite.py +0 -106
  62. smftools/tools/__init__.py +0 -0
  63. smftools-0.1.1.dist-info/METADATA +0 -88
  64. smftools-0.1.1.dist-info/RECORD +0 -64
  65. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  66. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,164 +0,0 @@
1
## converted_BAM_to_adata

def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
    """
    A wrapper function to take converted aligned_sorted_split BAM files and format the data into an anndata object.

    Parameters:
        converted_FASTA (str): File path to the converted FASTA reference.
        split_dir (str): File path to the directory containing the converted aligned_sorted_split BAM files.
        mapping_threshold (float): A value between 0 and 1 giving the minimal fraction of aligned reads which must map
            to a reference region. References with values above the threshold are included in the output adata.
        experiment_name (str): Experiment name embedded in the output adata file name.
        conversion_types (list): Conversion types to use in the analysis; the first entry names the unconverted reference.
        bam_suffix (str): The suffix to use for the BAM file.

    Returns:
        None
            Outputs a single gzipped adata object for the experiment.
    """
    from .. import readwrite
    from .binarize_converted_base_identities import binarize_converted_base_identities
    from .find_conversion_sites import find_conversion_sites
    from .count_aligned_reads import count_aligned_reads
    from .extract_base_identities import extract_base_identities
    from .one_hot_encode import one_hot_encode
    import pandas as pd
    import numpy as np
    import anndata as ad
    import os

    # Get all of the input BAM files
    files = os.listdir(split_dir)
    # Change directory to the BAM directory
    os.chdir(split_dir)
    # Keep files carrying the BAM suffix, excluding BAM index files
    bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
    # Sort file list by names and print the list of file names
    bams.sort()
    print(f'Found the following BAMS: {bams}')
    final_adata = None

    # modification_dict: modification type -> dict of unconverted_record_ids -> list of:
    # 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) record sequence
    modification_dict = {}
    # While populating the dictionary, also track the longest sequence record in the input references
    max_reference_length = 0
    for conversion_type in conversion_types:
        modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
        for record in modification_dict[conversion_type].keys():
            if modification_dict[conversion_type][record][0] > max_reference_length:
                max_reference_length = modification_dict[conversion_type][record][0]

    # Keyed by FASTA record; points to the sequence string of the unconverted record
    record_FASTA_dict = {}

    # Iterate over the experiment BAM files
    for bam_index, bam in enumerate(bams):
        # Give each bam a sample name
        sample = bam.split(sep=bam_suffix)[0]
        # Look at aligned read proportions in the bam
        aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
        percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
        print(f'{percent_aligned} percent of total reads in {bam} aligned successfully')
        records_to_analyze = []
        # Decide which converted reference strands to use in the analysis based on the mapping_threshold
        for record in record_counts:
            print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
            if record_counts[record][1] >= mapping_threshold:
                records_to_analyze.append(record)
        print(f'Records to analyze: {records_to_analyze}')
        # Iterate over records to analyze (ie all conversions detected)
        for record in records_to_analyze:
            mod_type, strand = record.split('_')[-2:]
            if strand == 'top':
                strand_index = 1
            elif strand == 'bottom':
                strand_index = 2

            chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
            unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
            positions = modification_dict[mod_type][unconverted_chromosome_name][strand_index]
            current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
            delta_max_length = max_reference_length - current_reference_length
            # Pad the unconverted sequence with N so every record spans max_reference_length
            sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
            record_FASTA_dict[f'{record}'] = sequence
            print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')

            # Get a dictionary of positional identities keyed by read id
            print(f'Extracting base identities of target positions')
            target_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
            # Binarize the dictionary of positional identities
            print(f'Binarizing base identities of target positions')
            binarized_base_identities = binarize_converted_base_identities(target_base_identities, strand, mod_type)
            # Convert the base identity dictionary to a dataframe
            binarized_base_identities_df = pd.DataFrame.from_dict(binarized_base_identities, orient='index')
            sorted_index = sorted(binarized_base_identities_df.index)
            binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
            # Get the sequence string of every read
            print(f'Extracting base identities of all positions in each read')
            all_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
            # One hot encode the sequence string of the reads
            print(f'One hot encoding base identities of all positions in each read')
            one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}

            # Initialize empty DataFrames for each base
            read_names = list(one_hot_reads.keys())
            sequence_length = one_hot_reads[read_names[0]].shape[0]
            df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
            df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
            df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
            df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
            df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))

            # Iterate through the dictionary and populate the DataFrames
            for read_name, one_hot_array in one_hot_reads.items():
                df_A.loc[read_name] = one_hot_array[:, 0]
                df_C.loc[read_name] = one_hot_array[:, 1]
                df_G.loc[read_name] = one_hot_array[:, 2]
                df_T.loc[read_name] = one_hot_array[:, 3]
                df_N.loc[read_name] = one_hot_array[:, 4]

            ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}

            # Load an anndata object with the sample data
            X = binarized_base_identities_df.values
            adata = ad.AnnData(X, dtype=X.dtype)
            adata.obs_names = binarized_base_identities_df.index
            adata.obs_names = adata.obs_names.astype(str)
            adata.var_names = binarized_base_identities_df.columns
            adata.var_names = adata.var_names.astype(str)
            adata.obs['Sample'] = [sample] * len(adata)
            adata.obs['Strand'] = [strand] * len(adata)
            adata.obs['Dataset'] = [mod_type] * len(adata)
            adata.obs['Reference'] = [record] * len(adata)
            adata.obs['Reference_chromosome'] = [chromosome] * len(adata)

            for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
                adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values

            # BUGFIX: use an identity check. AnnData truthiness is len()-based, so a
            # previously-accumulated object could be treated as absent and overwritten.
            if final_adata is not None:
                final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
            else:
                final_adata = adata

    for record in record_FASTA_dict.keys():
        chromosome = record.split('_')[0]
        sequence = record_FASTA_dict[record]
        final_adata.uns[f'{record}_FASTA_sequence'] = sequence
        final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)

        # May need to remove the bottom for conversion SMF
        record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
        layer_map, layer_counts = {}, []
        for i, layer in enumerate(record_subset.layers):
            layer_map[i] = layer.split('_')[0]
            layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
        # Take the per-position argmax over base counts as the consensus base
        count_array = np.array(layer_counts)
        nucleotide_indexes = np.argmax(count_array, axis=0)
        consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
        final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list

    ######################################################################################################

    ######################################################################################################
    ## Export the final adata object
    final_adata.write_h5ad('{0}_{1}.h5ad.gz'.format(readwrite.date_string(), experiment_name), compression='gzip')
@@ -1,39 +0,0 @@
1
## count_aligned_reads

# General
def count_aligned_reads(bam_file):
    """
    Counts the aligned and unaligned reads in a BAM file, and how many aligned reads map to each reference record.

    Parameters:
        bam_file (str): A string representing the path to an aligned BAM file.

    Returns:
        aligned_reads_count (int): The total number of reads aligned in the BAM.
        unaligned_reads_count (int): The total number of reads not aligned in the BAM.
        record_counts (dict): Keyed by reference record name; points to a tuple of
            (reads mapped to the record, fraction of all mapped reads that map to the record).
    """
    from .. import readwrite
    import pysam
    print('{0}: Counting aligned reads in BAM > {1}'.format(readwrite.time_string(), bam_file))
    aligned_reads_count = 0
    unaligned_reads_count = 0
    # reference name -> number of reads mapped to that reference
    record_counts = {}
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Tally total mapped/unmapped reads and the per-reference mapped counts
        for read in bam:
            if read.is_unmapped:
                unaligned_reads_count += 1
            else:
                aligned_reads_count += 1
                record_counts[read.reference_name] = record_counts.get(read.reference_name, 0) + 1
    # Replace raw counts with (count, fraction of mapped reads) tuples
    for reference in record_counts:
        mapped_to_record = record_counts[reference]
        record_counts[reference] = (mapped_to_record, mapped_to_record / aligned_reads_count)
    return aligned_reads_count, unaligned_reads_count, record_counts
@@ -1,43 +0,0 @@
1
## extract_base_identities

# General
def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
    """
    Extracts the base identities from every position within the read that has a reference coordinate.

    Parameters:
        bam_file (str): File path to the position-sorted, indexed BAM file.
        chromosome (str): Name of the record within the reference FASTA.
        positions (iterable): Position coordinates within the record to extract.
        max_reference_length (int): The maximum length of a record in the reference set.

    Returns:
        base_identities (dict): Keyed by read name; points to a list of base identities.
            Positions the read does not cover are left as 'N'.
    """
    from .. import readwrite
    import pysam
    positions = set(positions)
    # key: read name -> list of base identities at positions of interest
    base_identities = {}
    # Open the position sorted BAM file
    print('{0}: Reading BAM file: {1}'.format(readwrite.time_string(), bam_file))
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over every read in the bam that comes from the chromosome of interest
        print('{0}: Iterating over reads in bam'.format(readwrite.time_string()))
        for read in bam.fetch(chromosome):
            if read.query_name in base_identities:
                # Duplicate read name found in BAM: keep the first occurrence only
                continue
            # Initialize the read's entry as an N-filled list of length max_reference_length
            base_identities[read.query_name] = ['N'] * max_reference_length
            # Tuples of (0-indexed position in the read, 0-indexed position in the reference)
            for read_position, reference_position in read.get_aligned_pairs():
                # BUGFIX: compare read_position against None explicitly. The old truthiness
                # check treated read position 0 as unmapped, dropping the first read base.
                if reference_position in positions and read_position is not None:
                    # Record the read base aligned to this reference position
                    base_identities[read.query_name][reference_position] = read.query_sequence[read_position]
    return base_identities
@@ -1,51 +0,0 @@
1
## extract_mods

def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix):
    """
    Runs Nanopore modkit extract on every aligned, sorted, split modified BAM file and
    stores the modification calls in zipped TSV files.

    Parameters:
        thresholds (list): Thresholds for marking each basecalled base as passing or failing
            on canonical and modification call status (filter, m6A, m5C, hm5C).
        mod_tsv_dir (str): File path to the directory that holds the modkit extract outputs.
        split_dir (str): File path to the directory containing the converted aligned_sorted_split BAM files.
        bam_suffix (str): The suffix to use for the BAM file.

    Returns:
        None
            Writes a zipped TSV of modification calls per input BAM.
    """
    import os
    import subprocess
    import glob
    import zipfile

    os.chdir(mod_tsv_dir)
    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
    for input_file in glob.glob(os.path.join(split_dir, f"*{bam_suffix}")):
        print(input_file)
        # Derive the output TSV path from the BAM basename
        file_name = os.path.basename(input_file)
        output_tsv = os.path.join(mod_tsv_dir, file_name).replace(bam_suffix, "") + "_extract.tsv"
        # Run modkit summary
        subprocess.run(["modkit", "summary", input_file])
        # Run modkit extract with per-modification thresholds
        extract_command = ["modkit", "extract"]
        extract_command += ["--filter-threshold", f'{filter_threshold}']
        extract_command += ["--mod-thresholds", f"m:{m5C_threshold}"]
        extract_command += ["--mod-thresholds", f"a:{m6A_threshold}"]
        extract_command += ["--mod-thresholds", f"h:{hm5C_threshold}"]
        extract_command += [input_file, "null", "--read-calls", output_tsv]
        subprocess.run(extract_command)
        # Zip the output TSV, then remove the raw copy
        print(f'zipping {output_tsv}')
        with zipfile.ZipFile(f"{output_tsv}.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(output_tsv, os.path.basename(output_tsv))
        print(f'removing {output_tsv}')
        os.remove(output_tsv)
@@ -1,59 +0,0 @@
1
## find_conversion_sites

def find_conversion_sites(fasta_file, modification_type, conversion_types):
    """
    A function to find genomic coordinates of every cytosine (for 5mC) or every adenine
    (for 6mA) in every unconverted record contained within a FASTA file.

    Parameters:
        fasta_file (str): File path to the unconverted reference FASTA.
        modification_type (str): The modification type of interest (options are '5mC' and '6mA').
        conversion_types (list): Conversion types used in the analysis. Used here to pass the unconverted record name.

    Returns:
        record_dict (dict): Keyed by unconverted record ids contained within the FASTA. Points to a list containing:
            1) sequence length of the record, 2) top strand coordinate list,
            3) bottom strand coordinate list, 4) sequence string.
    """
    from .. import readwrite
    from Bio import SeqIO

    print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
    unconverted = conversion_types[0]
    record_dict = {}
    print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
    # Open the FASTA record as read only
    with open(fasta_file, "r") as f:
        # Iterate over records in the FASTA
        for record in SeqIO.parse(f, "fasta"):
            # Only iterate over the unconverted records for the reference
            if unconverted not in record.id:
                continue
            print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
            # Extract the sequence string of the record
            sequence = str(record.seq).upper()
            sequence_length = len(sequence)
            # BUGFIX: create fresh coordinate lists per record. Previously the lists were
            # created once outside the loop, so coordinates accumulated across records and
            # every record_dict entry shared (and polluted) the same two lists.
            top_strand_coordinates = []
            bottom_strand_coordinates = []
            if modification_type == '5mC':
                # Cytosines mark the top strand; guanines mark the bottom strand
                for i in range(0, len(sequence)):
                    if sequence[i] == 'C':
                        top_strand_coordinates.append(i)  # 0-indexed coordinate
                    if sequence[i] == 'G':
                        bottom_strand_coordinates.append(i)  # 0-indexed coordinate
                print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
            elif modification_type == '6mA':
                # Adenines mark the top strand; thymines mark the bottom strand
                for i in range(0, len(sequence)):
                    if sequence[i] == 'A':
                        top_strand_coordinates.append(i)  # 0-indexed coordinate
                    if sequence[i] == 'T':
                        bottom_strand_coordinates.append(i)  # 0-indexed coordinate
                print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
            else:
                print('modification_type not found. Please try 5mC or 6mA')
            record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence]
    return record_dict
@@ -1,79 +0,0 @@
1
## generate_converted_FASTA

def convert_FASTA_record(record, modification_type, strand, unconverted):
    """
    Takes a FASTA record and converts every instance of the modifiable base to the converted state.

    Parameters:
        record: The record instance within the FASTA (must expose `.seq` and `.id`).
        modification_type (str): The modification type to convert for ('5mC', '6mA', or the unconverted label).
        strand (str): The strand being converted ('top' or 'bottom'); ignored for the unconverted type.
        unconverted (str): The label that marks the unconverted reference type.

    Returns:
        new_seq (str): Converted sequence string.
        new_id (str): Record id for the converted sequence string.

    Raises:
        ValueError: If modification_type or strand is not recognized.
            (Previously invalid input only printed a message and then crashed
            with an UnboundLocalError at the return statement.)
    """
    if modification_type == '5mC':
        if strand == 'top':
            # Replace every 'C' with 'T' in the sequence
            new_seq = record.seq.upper().replace('C', 'T')
        elif strand == 'bottom':
            # Replace every 'G' with 'A' in the sequence
            new_seq = record.seq.upper().replace('G', 'A')
        else:
            raise ValueError('need to provide a valid strand string: top or bottom')
        new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
    elif modification_type == '6mA':
        if strand == 'top':
            # Replace every 'A' with 'G' in the sequence
            new_seq = record.seq.upper().replace('A', 'G')
        elif strand == 'bottom':
            # Replace every 'T' with 'C' in the sequence
            new_seq = record.seq.upper().replace('T', 'C')
        else:
            raise ValueError('need to provide a valid strand string: top or bottom')
        new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
    elif modification_type == unconverted:
        # The unconverted reference passes through unchanged, always labelled 'top'
        new_seq = record.seq.upper()
        new_id = '{0}_{1}_top'.format(record.id, modification_type)
    else:
        raise ValueError(f'need to provide a valid modification_type string: 5mC, 6mA, or {unconverted}')

    return new_seq, new_id
42
-
43
def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
    """
    Applies convert_FASTA_record to every record within the input FASTA and writes out a converted FASTA.

    Parameters:
        input_fasta (str): Path to the unconverted FASTA file.
        modification_types (list): Modification types to use in the experiment; the first entry is the unconverted label.
        strands (list): Conversion strands to use in the experiment.
        output_fasta (str): Path to the converted FASTA output file.

    Returns:
        None
            Writes out a converted FASTA reference for the experiment.
    """
    from .. import readwrite
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    modified_records = []
    unconverted = modification_types[0]
    # Walk every (record, modification, strand) combination
    for record in SeqIO.parse(input_fasta, 'fasta'):
        record_description = record.description
        for modification_type in modification_types:
            for strand_number, strand in enumerate(strands):
                # Emit the unconverted record only once (for the first strand)
                if modification_type == unconverted and strand_number > 0:
                    continue
                print(f'converting {modification_type} on the {strand} strand of record {record}')
                new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
                modified_records.append(SeqRecord(Seq(new_seq), id=new_id, description=record_description))
    # Write out the concatenated FASTA file of modified sequences
    with open(output_fasta, 'w') as output_handle:
        SeqIO.write(modified_records, output_handle, 'fasta')
@@ -1,28 +0,0 @@
1
## get_native_references

# Direct methylation specific
def get_native_references(fasta_file):
    """
    Makes a dictionary keyed by record id which points to the record length and record sequence.

    Parameters:
        fasta_file (str): A string representing the path to the FASTA file for the experiment.

    Returns:
        record_dict (dict): Keyed by record id; points to [sequence length, sequence string].
            (The original docstring incorrectly claimed this function returns None.)
    """
    from .. import readwrite
    from Bio import SeqIO
    record_dict = {}
    print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
    # Open the FASTA record as read only
    with open(fasta_file, "r") as f:
        # Iterate over records in the FASTA
        for record in SeqIO.parse(f, "fasta"):
            # Extract the sequence string of the record
            sequence = str(record.seq).upper()
            record_dict[record.id] = [len(sequence), sequence]
    return record_dict
@@ -1,21 +0,0 @@
1
## make_dirs

# General
def make_dirs(directories):
    """
    Takes a list of file paths and makes new directories if the directory does not already exist.

    Parameters:
        directories (list): A list of directories to make.

    Returns:
        None
    """
    import os

    for directory in directories:
        if os.path.isdir(directory):
            print(f"Directory '{directory}' already exists.")
        else:
            # makedirs (vs mkdir) also creates any missing intermediate directories,
            # so nested target paths no longer raise FileNotFoundError.
            os.makedirs(directory)
            print(f"Directory '{directory}' created successfully.")
@@ -1,27 +0,0 @@
1
## make_modbed

# Direct SMF
def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
    """
    Generates per-position methylation summaries for each barcoded sample with modkit pileup,
    starting from the overall BAM file that was the direct output of the dorado aligner.

    Parameters:
        aligned_sorted_output (str): File path to the aligned_sorted non-split BAM file.
        thresholds (list): Call thresholds (filter, m6A, m5C, hm5C).
        mod_bed_dir (str): Output directory for the per-barcode pileup bed files.

    Returns:
        None
    """
    import os
    import subprocess

    os.chdir(mod_bed_dir)
    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
    # modkit pileup, partitioned by the BC (barcode) tag
    pileup_command = ["modkit", "pileup", aligned_sorted_output, mod_bed_dir]
    pileup_command += ["--partition-tag", "BC"]
    pileup_command += ["--only-tabs"]
    pileup_command += ["--filter-threshold", f'{filter_threshold}']
    pileup_command += ["--mod-thresholds", f"m:{m5C_threshold}"]
    pileup_command += ["--mod-thresholds", f"a:{m6A_threshold}"]
    pileup_command += ["--mod-thresholds", f"h:{hm5C_threshold}"]
    subprocess.run(pileup_command)
@@ -1,27 +0,0 @@
1
## modQC

# Direct SMF
def modQC(aligned_sorted_output, thresholds):
    """
    Outputs the percentile of bases falling at each call threshold (a probability between 0-1)
    for the overall BAM file. It is generally good to look at these parameters on positive
    and negative controls.

    Parameters:
        aligned_sorted_output (str): File path of the aligned_sorted non-split BAM file output by the dorado aligner.
        thresholds (list): Floats to pass as call thresholds (filter, m6A, m5C, hm5C).

    Returns:
        None
    """
    import subprocess

    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
    # First report the sampled call probabilities, then the thresholded summary
    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
    summary_command = ["modkit", "summary", aligned_sorted_output]
    summary_command += ["--filter-threshold", f"{filter_threshold}"]
    summary_command += ["--mod-thresholds", f"m:{m5C_threshold}"]
    summary_command += ["--mod-thresholds", f"a:{m6A_threshold}"]
    summary_command += ["--mod-thresholds", f"h:{hm5C_threshold}"]
    subprocess.run(summary_command)
@@ -1,26 +0,0 @@
1
## modcall

# Direct methylation specific
def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
    """
    Wrapper function for dorado modified base calling.

    Parameters:
        model (str): File path to the dorado basecalling model.
        pod5_dir (str): File path to the experiment directory containing the POD5 files.
        barcode_kit (str): The barcoding kit used in the experiment.
        mod_list (list): Modification types to use in the analysis.
        bam (str): File path to the BAM file to output.
        bam_suffix (str): The suffix to use for the BAM file.

    Returns:
        None
            Outputs a BAM file holding the modified base calls output by the dorado basecaller.
    """
    import subprocess
    output_path = bam + bam_suffix
    basecall_command = [
        "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
        "--modified-bases", ",".join(mod_list),  # join MOD_LIST elements with commas
    ]
    # Dorado writes the BAM stream to stdout; capture it into the output file
    with open(output_path, "w") as outfile:
        subprocess.run(basecall_command, stdout=outfile)