smftools-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. smftools/__init__.py +27 -0
  2. smftools/_settings.py +24 -0
  3. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  4. smftools/datasets/__init__.py +9 -0
  5. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  6. smftools/datasets/datasets.py +25 -0
  7. smftools/informatics/__init__.py +11 -0
  8. smftools/informatics/helpers/__init__.py +42 -0
  9. smftools/informatics/helpers/align_BAM.py +58 -0
  10. smftools/informatics/helpers/binarize_converted_base_identities.py +24 -0
  11. smftools/informatics/helpers/canoncall.py +12 -0
  12. smftools/informatics/helpers/converted_BAM_to_adata.py +148 -0
  13. smftools/informatics/helpers/count_aligned_reads.py +32 -0
  14. smftools/informatics/helpers/extract_base_identities.py +36 -0
  15. smftools/informatics/helpers/extract_mods.py +39 -0
  16. smftools/informatics/helpers/find_conversion_sites.py +53 -0
  17. smftools/informatics/helpers/generate_converted_FASTA.py +60 -0
  18. smftools/informatics/helpers/get_native_references.py +25 -0
  19. smftools/informatics/helpers/informatics.py +260 -0
  20. smftools/informatics/helpers/load_adata.py +516 -0
  21. smftools/informatics/helpers/load_experiment_config.py +17 -0
  22. smftools/informatics/helpers/make_dirs.py +15 -0
  23. smftools/informatics/helpers/make_modbed.py +21 -0
  24. smftools/informatics/helpers/modQC.py +19 -0
  25. smftools/informatics/helpers/modcall.py +14 -0
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +355 -0
  27. smftools/informatics/helpers/one_hot_encode.py +14 -0
  28. smftools/informatics/helpers/separate_bam_by_bc.py +28 -0
  29. smftools/informatics/helpers/split_and_index_BAM.py +21 -0
  30. smftools/informatics/pod5_conversion.py +26 -0
  31. smftools/informatics/pod5_direct.py +29 -0
  32. smftools/informatics/pod5_to_adata.py +17 -0
  33. smftools/informatics/readwrite.py +109 -0
  34. smftools/plotting/__init__.py +0 -0
  35. smftools/preprocessing/__init__.py +35 -0
  36. smftools/preprocessing/append_C_context.py +39 -0
  37. smftools/preprocessing/binarize_on_Youden.py +38 -0
  38. smftools/preprocessing/binary_layers_to_ohe.py +25 -0
  39. smftools/preprocessing/calculate_complexity.py +59 -0
  40. smftools/preprocessing/calculate_converted_read_methylation_stats.py +38 -0
  41. smftools/preprocessing/calculate_coverage.py +35 -0
  42. smftools/preprocessing/calculate_pairwise_hamming_distances.py +22 -0
  43. smftools/preprocessing/calculate_position_Youden.py +95 -0
  44. smftools/preprocessing/calculate_read_length_stats.py +27 -0
  45. smftools/preprocessing/clean_NaN.py +31 -0
  46. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -0
  47. smftools/preprocessing/filter_reads_on_length.py +31 -0
  48. smftools/preprocessing/invert_adata.py +18 -0
  49. smftools/preprocessing/mark_duplicates.py +110 -0
  50. smftools/preprocessing/min_non_diagonal.py +20 -0
  51. smftools/preprocessing/preprocessing.py +614 -0
  52. smftools/preprocessing/remove_duplicates.py +12 -0
  53. smftools/readwrite.py +109 -0
  54. smftools/tools/__init__.py +0 -0
  55. smftools-0.1.0.dist-info/METADATA +75 -0
  56. smftools-0.1.0.dist-info/RECORD +58 -0
  57. smftools-0.1.0.dist-info/WHEEL +4 -0
  58. smftools-0.1.0.dist-info/licenses/LICENSE +21 -0
smftools/__init__.py ADDED
@@ -0,0 +1,27 @@
+ """smftools"""
+
+ import logging
+ import warnings
+
+ from anndata import AnnData
+ from . import informatics as inform
+ from . import preprocessing as pp
+ from . import tools as tl
+ from . import plotting as pl
+ from . import readwrite, datasets
+
+
+ from importlib.metadata import version
+
+ package_name = "smftools"
+ __version__ = version(package_name)
+
+ __all__ = [
+     "AnnData",
+     "inform",
+     "pp",
+     "tl",
+     "pl",
+     "readwrite",
+     "datasets"
+ ]
smftools/_settings.py ADDED
@@ -0,0 +1,24 @@
+ from pathlib import Path
+
+ class SMFConfig:
+     """\
+     Config for smftools.
+     """
+
+     def __init__(
+         self,
+         *,
+         datasetdir: Path | str = "./datasets/"
+     ):
+         self.datasetdir = datasetdir
+
+     @property
+     def datasetdir(self) -> Path:
+         return self._datasetdir
+
+     @datasetdir.setter
+     def datasetdir(self, path: Path | str):
+         # Coerce to Path; without a setter, the assignment in __init__ raises AttributeError
+         self._datasetdir = Path(path)
+
+ settings = SMFConfig()
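A minimal usage sketch of the config object (this assumes the Path-coercing setter above; `datasetdir` is the only setting defined here):

    from smftools._settings import settings

    settings.datasetdir = "/data/smf_datasets"  # coerced to a pathlib.Path by the setter
    print(settings.datasetdir)                  # PosixPath('/data/smf_datasets')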
smftools/datasets/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .datasets import (
+     dCas9_kinetics,
+     Kissiov_and_McKenna_2025
+ )
+
+ __all__ = [
+     "dCas9_kinetics",
+     "Kissiov_and_McKenna_2025"
+ ]
smftools/datasets/datasets.py ADDED
@@ -0,0 +1,25 @@
+ ## datasets
+
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ from pathlib import Path
+
+ from .._settings import settings
+
+ HERE = Path(__file__).parent
+
+
+ def dCas9_kinetics():
+     """
+     Load the bundled dCas9 m6A in vitro kinetics SMF dataset as an AnnData object.
+     """
+     filepath = HERE / "dCas9_m6A_invitro_kinetics.h5ad.gz"
+     return ad.read_h5ad(filepath)
+
+ def Kissiov_and_McKenna_2025():
+     """
+     Load the bundled F1 hybrid NKG2A enhancer/promoter GpC conversion SMF dataset (Kissiov and McKenna, 2025) as an AnnData object.
+     """
+     filepath = HERE / "F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz"
+     return ad.read_h5ad(filepath)
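Both loaders return the bundled data as an AnnData object; a typical call:

    import smftools as smf

    adata = smf.datasets.dCas9_kinetics()
    print(adata)  # AnnData summary (reads x reference positions, per the converters in this package)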
smftools/informatics/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from . import helpers
+ from .pod5_conversion import pod5_conversion
+ from .pod5_direct import pod5_direct
+ from .pod5_to_adata import pod5_to_adata
+
+ __all__ = [
+     "helpers",
+     "pod5_conversion",
+     "pod5_direct",
+     "pod5_to_adata"
+ ]
smftools/informatics/helpers/__init__.py ADDED
@@ -0,0 +1,42 @@
+ from .align_BAM import align_BAM
+ from .binarize_converted_base_identities import binarize_converted_base_identities
+ from .canoncall import canoncall
+ from .converted_BAM_to_adata import converted_BAM_to_adata
+ from .count_aligned_reads import count_aligned_reads
+ from .extract_base_identities import extract_base_identities
+ from .extract_mods import extract_mods
+ from .find_conversion_sites import find_conversion_sites
+ from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
+ from .get_native_references import get_native_references
+ from .load_experiment_config import load_experiment_config
+ from .make_dirs import make_dirs
+ from .make_modbed import make_modbed
+ from .modcall import modcall
+ from .modkit_extract_to_adata import modkit_extract_to_adata
+ from .modQC import modQC
+ from .one_hot_encode import one_hot_encode
+ from .separate_bam_by_bc import separate_bam_by_bc
+ from .split_and_index_BAM import split_and_index_BAM
+
+ __all__ = [
+     "align_BAM",
+     "binarize_converted_base_identities",
+     "canoncall",
+     "converted_BAM_to_adata",
+     "count_aligned_reads",
+     "extract_base_identities",
+     "extract_mods",
+     "find_conversion_sites",
+     "convert_FASTA_record",
+     "generate_converted_FASTA",
+     "get_native_references",
+     "load_experiment_config",
+     "make_dirs",
+     "make_modbed",
+     "modcall",
+     "modkit_extract_to_adata",
+     "modQC",
+     "one_hot_encode",
+     "separate_bam_by_bc",
+     "split_and_index_BAM"
+ ]
smftools/informatics/helpers/align_BAM.py ADDED
@@ -0,0 +1,58 @@
+ ## align_BAM
+ import subprocess
+
+ def align_BAM(fasta, bam, bam_suffix):
+     """
+     A wrapper for running the dorado aligner and samtools utilities.
+     """
+     aligned_BAM = f"{bam}_aligned"
+     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
+     output = bam + bam_suffix
+     aligned_output = aligned_BAM + bam_suffix
+     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
+
+     # Run dorado aligner; close the output handle when done
+     with open(aligned_output, "w") as outfile:
+         subprocess.run([
+             "dorado", "aligner",
+             "--secondary=no",
+             fasta,
+             output
+         ], stdout=outfile)
+
+     # Sort the BAM on positional coordinates
+     subprocess.run([
+         "samtools", "sort",
+         "-o", aligned_sorted_output,
+         aligned_output
+     ])
+
+     # Create a BAM index file
+     subprocess.run([
+         "samtools", "index",
+         aligned_sorted_output
+     ])
+
+     # Make a bed file of coordinates for the BAM.
+     # subprocess.run results cannot be piped with |; chain a Popen into run instead.
+     with open(f"{aligned_sorted_BAM}_bed.bed", "w") as bed_out:
+         view = subprocess.Popen(
+             ["samtools", "view", aligned_sorted_output],
+             stdout=subprocess.PIPE
+         )
+         subprocess.run(
+             ["awk", '{print $3, $4, $4+length($10)-1}'],
+             stdin=view.stdout, stdout=bed_out
+         )
+         view.stdout.close()
+         view.wait()
+
+     # Make a text file of read names for the BAM
+     with open(f"{aligned_sorted_BAM}_read_names.txt", "w") as names_out:
+         view = subprocess.Popen(
+             ["samtools", "view", aligned_sorted_output],
+             stdout=subprocess.PIPE
+         )
+         subprocess.run(["cut", "-f1"], stdin=view.stdout, stdout=names_out)
+         view.stdout.close()
+         view.wait()
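Note that `bam` is a path prefix, not a full filename; a hypothetical call (paths illustrative):

    align_BAM("reference_converted.fa", "basecalls/canonical", ".bam")
    # Reads basecalls/canonical.bam and writes:
    #   basecalls/canonical_aligned.bam
    #   basecalls/canonical_aligned_sorted.bam (plus a .bai index)
    #   basecalls/canonical_aligned_sorted_bed.bed
    #   basecalls/canonical_aligned_sorted_read_names.txt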
smftools/informatics/helpers/binarize_converted_base_identities.py ADDED
@@ -0,0 +1,24 @@
+ ## binarize_converted_base_identities
+ import numpy as np
+ # Conversion SMF specific
+ def binarize_converted_base_identities(base_identities, strand, modification_type):
+     """
+     Input: The base identities dictionary returned by extract_base_identity_at_coordinates.
+     Output: A binarized format of the dictionary, where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site that does not carry SMF information.
+     """
+     binarized_base_identities = {}
+     # Iterate over base identity keys to binarize the base identities
+     for key in base_identities.keys():
+         if strand == 'top':
+             if modification_type == '5mC':
+                 binarized_base_identities[key] = [1 if x == 'C' else 0 if x == 'T' else np.nan for x in base_identities[key]]
+             elif modification_type == '6mA':
+                 binarized_base_identities[key] = [1 if x == 'A' else 0 if x == 'G' else np.nan for x in base_identities[key]]
+         elif strand == 'bottom':
+             if modification_type == '5mC':
+                 binarized_base_identities[key] = [1 if x == 'G' else 0 if x == 'A' else np.nan for x in base_identities[key]]
+             elif modification_type == '6mA':
+                 binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
+         else:
+             pass
+     return binarized_base_identities
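A toy example of the binarization for a top-strand 5mC conversion experiment:

    base_identities = {"read_1": ["C", "T", "C", "A"]}
    binarize_converted_base_identities(base_identities, "top", "5mC")
    # {'read_1': [1, 0, 1, nan]} -- C is protected (methylated), T is converted, A carries no SMF information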
smftools/informatics/helpers/canoncall.py ADDED
@@ -0,0 +1,12 @@
+ ## canoncall
+ import subprocess
+
+ # Conversion SMF specific
+ def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
+     """
+     Wrapper function for dorado canonical base calling.
+     """
+     output = bam + bam_suffix
+     command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
+     with open(output, "w") as outfile:
+         subprocess.run(command, stdout=outfile)
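A hypothetical call (model name, pod5 directory, and kit name are all illustrative):

    canoncall(
        model="dna_r10.4.1_e8.2_400bps_sup@v4.2.0",  # illustrative dorado model name
        pod5_dir="pod5/",
        barcode_kit="SQK-NBD114-24",                 # illustrative kit name
        bam="basecalls/canonical",
        bam_suffix=".bam",
    )
    # Writes dorado's BAM output to basecalls/canonical.bam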
smftools/informatics/helpers/converted_BAM_to_adata.py ADDED
@@ -0,0 +1,148 @@
+ ## converted_BAM_to_adata
+ from .. import readwrite
+ from .binarize_converted_base_identities import binarize_converted_base_identities
+ from .find_conversion_sites import find_conversion_sites
+ from .count_aligned_reads import count_aligned_reads
+ from .extract_base_identities import extract_base_identities
+ from .one_hot_encode import one_hot_encode
+ import pandas as pd
+ import numpy as np
+ import anndata as ad
+ import os
+
+ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
+     """
+     Load the aligned, sorted, split BAM files from a conversion SMF experiment into a single AnnData object and write it out as a gzipped h5ad.
+     """
+     # Get all of the input BAM files
+     files = os.listdir(split_dir)
+     # Change directory to the BAM directory
+     os.chdir(split_dir)
+     # Keep file names that contain the BAM suffix, excluding index files
+     bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
+     # Sort the file list by name and print the list of file names
+     bams.sort()
+     print(f'Found the following BAMs: {bams}')
+     final_adata = None
+
+     # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) record sequence
+     modification_dict = {}
+     # While populating the dictionary, also find the longest sequence record in the input references
+     max_reference_length = 0
+     for conversion_type in conversion_types:
+         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type)
+         for record in modification_dict[conversion_type].keys():
+             if modification_dict[conversion_type][record][0] > max_reference_length:
+                 max_reference_length = modification_dict[conversion_type][record][0]
+
+     # Iterate over the experiment BAM files
+     for bam_index, bam in enumerate(bams):
+         # Give each bam a sample name
+         sample = bam.split(sep=bam_suffix)[0]
+         # Look at aligned read proportions in the bam
+         aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
+         percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
+         print(f'{percent_aligned} percent of total reads in {bam} aligned successfully')
+         records_to_analyze = []
+         # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
+         for record in record_counts:
+             print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
+             if record_counts[record][1] >= mapping_threshold:
+                 records_to_analyze.append(record)
+         print(f'Records to analyze: {records_to_analyze}')
+         # Iterate over records to analyze (ie all conversions detected)
+         record_FASTA_dict = {}
+         for record in records_to_analyze:
+             mod_type, strand = record.split('_')[-2:]
+             if strand == 'top':
+                 strand_index = 1
+             elif strand == 'bottom':
+                 strand_index = 2
+
+             chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
+             unconverted_chromosome_name = chromosome + '_unconverted_top'
+             positions = modification_dict[mod_type][unconverted_chromosome_name][strand_index]
+             current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
+             delta_max_length = max_reference_length - current_reference_length
+             sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
+             record_FASTA_dict[f'{record}'] = sequence
+             print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')
+
+             # Get a dictionary of positional identities keyed by read id
+             print('Extracting base identities of target positions')
+             target_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
+             # Binarize the dictionary of positional identities
+             print('Binarizing base identities of target positions')
+             binarized_base_identities = binarize_converted_base_identities(target_base_identities, strand, mod_type)
+             # Convert the base identity dictionary to a dataframe
+             binarized_base_identities_df = pd.DataFrame.from_dict(binarized_base_identities, orient='index')
+             sorted_index = sorted(binarized_base_identities_df.index)
+             binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
+             # Get the sequence string of every read
+             print('Extracting base identities of all positions in each read')
+             all_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
+             # One-hot encode the sequence string of the reads
+             print('One hot encoding base identities of all positions in each read')
+             one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}
+
+             # Initialize empty DataFrames for each base
+             read_names = list(one_hot_reads.keys())
+             sequence_length = one_hot_reads[read_names[0]].shape[0]
+             df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+             df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+             df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+             df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+             df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+
+             # Iterate through the dictionary and populate the DataFrames
+             for read_name, one_hot_array in one_hot_reads.items():
+                 df_A.loc[read_name] = one_hot_array[:, 0]
+                 df_C.loc[read_name] = one_hot_array[:, 1]
+                 df_G.loc[read_name] = one_hot_array[:, 2]
+                 df_T.loc[read_name] = one_hot_array[:, 3]
+                 df_N.loc[read_name] = one_hot_array[:, 4]
+
+             ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+
+             # Load an anndata object with the sample data
+             X = binarized_base_identities_df.values
+             adata = ad.AnnData(X, dtype=X.dtype)
+             adata.obs_names = binarized_base_identities_df.index
+             adata.obs_names = adata.obs_names.astype(str)
+             adata.var_names = binarized_base_identities_df.columns
+             adata.var_names = adata.var_names.astype(str)
+             adata.obs['Sample'] = [sample] * len(adata)
+             adata.obs['Strand'] = [strand] * len(adata)
+             adata.obs['Dataset'] = [mod_type] * len(adata)
+             adata.obs['Reference'] = [record] * len(adata)
+             adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
+
+             for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                 adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+
+             # Truth-testing an AnnData object is ambiguous, so compare against None
+             if final_adata is not None:
+                 final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
+             else:
+                 final_adata = adata
+
+         for record in record_FASTA_dict.keys():
+             chromosome = record.split('_')[0]
+             sequence = record_FASTA_dict[record]
+             final_adata.uns[f'{record}_FASTA_sequence'] = sequence
+             final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)
+             record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
+             layer_map, layer_counts = {}, []
+             for i, layer in enumerate(record_subset.layers):
+                 layer_map[i] = layer.split('_')[0]
+                 layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
+             count_array = np.array(layer_counts)
+             nucleotide_indexes = np.argmax(count_array, axis=0)
+             consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+             final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
+
+     ######################################################################################################
+
+     ######################################################################################################
+     ## Export the final adata object
+     final_adata.write_h5ad('{0}_{1}.h5ad.gz'.format(readwrite.date_string(), experiment_name), compression='gzip')
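A hypothetical end-to-end invocation (paths and values illustrative; `mapping_threshold` is the minimum fraction of mapped reads a reference record needs in order to be analyzed):

    converted_BAM_to_adata(
        converted_FASTA="reference_converted.fa",
        split_dir="split_BAMs/",
        mapping_threshold=0.05,
        experiment_name="my_SMF_run",
        conversion_types=["5mC"],
        bam_suffix=".bam",
    )
    # Writes <date>_my_SMF_run.h5ad.gz in the working directory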
smftools/informatics/helpers/count_aligned_reads.py ADDED
@@ -0,0 +1,32 @@
+ ## count_aligned_reads
+ from .. import readwrite
+ # bioinformatic operations
+ import pysam
+
+ # General
+ def count_aligned_reads(bam_file):
+     """
+     Input: A BAM alignment file.
+     Output: The number of aligned/unaligned reads in the BAM file. Also returns a dictionary, keyed by reference id, that points to a tuple. The tuple contains the integer number of reads mapped to that reference, followed by the proportion of mapped reads that map to that reference.
+     """
+     print('{0}: Counting aligned reads in BAM > {1}'.format(readwrite.time_string(), bam_file))
+     aligned_reads_count = 0
+     unaligned_reads_count = 0
+     # Make a dictionary, keyed by the reference_name of the reference chromosome, that points to the integer number of reads mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
+     record_counts = {}
+     with pysam.AlignmentFile(bam_file, "rb") as bam:
+         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
+         for read in bam:
+             if read.is_unmapped:
+                 unaligned_reads_count += 1
+             else:
+                 aligned_reads_count += 1
+                 if read.reference_name in record_counts:
+                     record_counts[read.reference_name] += 1
+                 else:
+                     record_counts[read.reference_name] = 1
+     # Reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in the reference
+     for reference in record_counts:
+         proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
+         record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
+     return aligned_reads_count, unaligned_reads_count, record_counts
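A usage sketch (BAM path illustrative):

    aligned, unaligned, per_record = count_aligned_reads("sample_aligned_sorted.bam")
    for reference, (n_reads, fraction) in per_record.items():
        print(f"{reference}: {n_reads} reads ({fraction:.1%} of mapped reads)")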
smftools/informatics/helpers/extract_base_identities.py ADDED
@@ -0,0 +1,36 @@
+ ## extract_base_identities
+ from .. import readwrite
+ # bioinformatic operations
+ import pysam
+
+ # General
+ def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
+     """
+     Input: A position-sorted BAM file, a chromosome name, a set of position coordinates, and the reference length to use when extracting base identities from reads.
+     Output: A dictionary, keyed by read name, that points to a list of base identities from each read.
+     If the read does not contain a position, the list is filled with an N value at that index.
+     """
+     positions = set(positions)
+     # Initialize a base identity dictionary that will hold key-value pairs that are: key (read name) and value (list of base identities at positions of interest)
+     base_identities = {}
+     # Open the position-sorted BAM file
+     print('{0}: Reading BAM file: {1}'.format(readwrite.time_string(), bam_file))
+     with pysam.AlignmentFile(bam_file, "rb") as bam:
+         # Iterate over every read in the bam that comes from the chromosome of interest
+         print('{0}: Iterating over reads in bam'.format(readwrite.time_string()))
+         for read in bam.fetch(chromosome):
+             if read.query_name in base_identities:
+                 pass
+                 #print('Duplicate read found in BAM for read {}. Skipping duplicate'.format(read.query_name))
+             else:
+                 # Initialize the read key in the base_identities dictionary by pointing to an N-filled list of length max_reference_length
+                 base_identities[read.query_name] = ['N'] * max_reference_length
+                 # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read start, as well as the 0-based index relative to the reference.
+                 for read_position, reference_position in read.get_aligned_pairs():
+                     # If the aligned read's reference coordinate is in the positions set and the read position was mapped (position 0 is valid, so compare against None)
+                     if reference_position in positions and read_position is not None:
+                         # Get the base identity in the read corresponding to that position
+                         base_identity = read.query_sequence[read_position]
+                         # Add the base identity to the array
+                         base_identities[read.query_name][reference_position] = base_identity
+     return base_identities
smftools/informatics/helpers/extract_mods.py ADDED
@@ -0,0 +1,39 @@
+ ## extract_mods
+ import os
+ import subprocess
+ import glob
+ import zipfile
+
+ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix):
+     """
+     Takes all of the aligned, sorted, split modified BAM files and runs modkit extract to load the modification data into zipped TSV files.
+     """
+     os.chdir(mod_tsv_dir)
+     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+     bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
+     for input_file in bam_files:
+         print(input_file)
+         # Extract the file basename
+         file_name = os.path.basename(input_file)
+         # Construct the output TSV file path
+         output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
+         output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
+         # Run modkit summary
+         subprocess.run(["modkit", "summary", input_file])
+         # Run modkit extract (cast the filter threshold to str, since subprocess arguments must be strings)
+         subprocess.run([
+             "modkit", "extract",
+             "--filter-threshold", str(filter_threshold),
+             "--mod-thresholds", f"m:{m5C_threshold}",
+             "--mod-thresholds", f"a:{m6A_threshold}",
+             "--mod-thresholds", f"h:{hm5C_threshold}",
+             input_file, "null",
+             "--read-calls", output_tsv
+         ])
+         # Zip the output TSV
+         print(f'zipping {output_tsv}')
+         with zipfile.ZipFile(f"{output_tsv}.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
+             zipf.write(output_tsv, os.path.basename(output_tsv))
+         # Remove the non-zipped TSV
+         print(f'removing {output_tsv}')
+         os.remove(output_tsv)
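`thresholds` unpacks positionally as (filter, m6A, m5C, hm5C); a hypothetical call with illustrative values:

    extract_mods(
        thresholds=(0.8, 0.9, 0.9, 0.9),  # filter, m6A, m5C, hm5C
        mod_tsv_dir="mod_tsvs/",
        split_dir="split_BAMs/",
        bam_suffix=".bam",
    )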
smftools/informatics/helpers/find_conversion_sites.py ADDED
@@ -0,0 +1,53 @@
+ ## find_conversion_sites
+ from .. import readwrite
+ # bioinformatic operations
+ from Bio import SeqIO
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Seq import Seq
+
+ def find_conversion_sites(fasta_file, modification_type):
+     """
+     Finds the coordinates of every cytosine in each unconverted record contained within a FASTA file.
+     If searching for adenine conversions, it will instead find the coordinates of all adenines.
+     Input: A FASTA file and the modification type of interest
+     Returns:
+         A dictionary called record_dict, which is keyed by the unconverted record ids contained within the FASTA. Each key points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string
+     """
+     print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
+     record_dict = {}
+     print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
+     # Open the FASTA record as read only
+     with open(fasta_file, "r") as f:
+         # Iterate over records in the FASTA
+         for record in SeqIO.parse(f, "fasta"):
+             # Only iterate over the unconverted records for the reference
+             if 'unconverted' in record.id:
+                 print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
+                 # Reset the top and bottom strand coordinate lists for each record so coordinates do not accumulate across records
+                 top_strand_coordinates = []
+                 bottom_strand_coordinates = []
+                 # Extract the sequence string of the record
+                 sequence = str(record.seq).upper()
+                 sequence_length = len(sequence)
+                 if modification_type == '5mC':
+                     # Iterate over the sequence string from the record
+                     for i in range(0, len(sequence)):
+                         if sequence[i] == 'C':
+                             top_strand_coordinates.append(i) # 0-indexed coordinate
+                         if sequence[i] == 'G':
+                             bottom_strand_coordinates.append(i) # 0-indexed coordinate
+                     print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
+                 elif modification_type == '6mA':
+                     # Iterate over the sequence string from the record
+                     for i in range(0, len(sequence)):
+                         if sequence[i] == 'A':
+                             top_strand_coordinates.append(i) # 0-indexed coordinate
+                         if sequence[i] == 'T':
+                             bottom_strand_coordinates.append(i) # 0-indexed coordinate
+                     print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
+                 else:
+                     print('modification_type not found. Please try 5mC or 6mA')
+                 record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence]
+             else:
+                 pass
+     return record_dict
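A usage sketch (FASTA path and record id are illustrative; the keys follow the `<id>_unconverted_<strand>` naming produced by generate_converted_FASTA below):

    sites = find_conversion_sites("reference_converted.fa", "5mC")
    length, top_Cs, bottom_Gs, seq = sites["myLocus_unconverted_top"]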
smftools/informatics/helpers/generate_converted_FASTA.py ADDED
@@ -0,0 +1,60 @@
+ ## generate_converted_FASTA
+ from .. import readwrite
+ # bioinformatic operations
+ from Bio import SeqIO
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Seq import Seq
+
+ def convert_FASTA_record(record, modification_type, strand):
+     """
+     Input: Takes a FASTA record, modification type, and strand as input
+     Output: Returns a new SeqRecord object with the conversions of interest
+     """
+     if modification_type == '5mC':
+         if strand == 'top':
+             # Replace every 'C' with 'T' in the sequence
+             new_seq = record.seq.upper().replace('C', 'T')
+         elif strand == 'bottom':
+             # Replace every 'G' with 'A' in the sequence
+             new_seq = record.seq.upper().replace('G', 'A')
+         else:
+             print('need to provide a valid strand string: top or bottom')
+     elif modification_type == '6mA':
+         if strand == 'top':
+             # Replace every 'A' with 'G' in the sequence
+             new_seq = record.seq.upper().replace('A', 'G')
+         elif strand == 'bottom':
+             # Replace every 'T' with 'C' in the sequence
+             new_seq = record.seq.upper().replace('T', 'C')
+         else:
+             print('need to provide a valid strand string: top or bottom')
+     elif modification_type == 'unconverted':
+         new_seq = record.seq.upper()
+     else:
+         print('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')
+     new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
+     # Return a new SeqRecord with the modified sequence and ID
+     return SeqRecord(new_seq, id=new_id, description=record.description)
+
+ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
+     """
+     Input: Takes an input FASTA, modification types of interest, strands of interest, and an output FASTA name
+     Output: Writes out a new FASTA with all stranded conversions
+     Notes: Uses convert_FASTA_record on every record within the FASTA
+     """
+     with open(output_fasta, 'w') as output_handle:
+         modified_records = []
+         # Iterate over each record in the input FASTA
+         for record in SeqIO.parse(input_fasta, 'fasta'):
+             # Iterate over each modification type of interest
+             for modification_type in modification_types:
+                 # Iterate over the strands of interest
+                 for i, strand in enumerate(strands):
+                     if i > 0 and modification_type == 'unconverted': # This ensures that the unconverted record is only added once and takes on the strand provided at index 0 of strands.
+                         pass
+                     else:
+                         # Add the modified record to the list of modified records
+                         print(f'converting {modification_type} on the {strand} strand of record {record.id}')
+                         modified_records.append(convert_FASTA_record(record, modification_type, strand))
+         # Write out the concatenated FASTA file of modified sequences
+         SeqIO.write(modified_records, output_handle, 'fasta')
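A hypothetical call (file names illustrative); for an input record `myLocus` this writes the records myLocus_5mC_top, myLocus_5mC_bottom, and myLocus_unconverted_top:

    generate_converted_FASTA(
        input_fasta="reference.fa",
        modification_types=["5mC", "unconverted"],
        strands=["top", "bottom"],
        output_fasta="reference_converted.fa",
    )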
smftools/informatics/helpers/get_native_references.py ADDED
@@ -0,0 +1,25 @@
+ ## get_native_references
+ from .. import readwrite
+ # bioinformatic operations
+ from Bio import SeqIO
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Seq import Seq
+
+ # Direct methylation specific
+ def get_native_references(fasta_file):
+     """
+     Input: A FASTA file
+     Returns:
+         A dictionary called record_dict, which is keyed by the record ids contained within the FASTA. Each key points to a list containing: 1) sequence length of the record, 2) sequence of the record
+     """
+     record_dict = {}
+     print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
+     # Open the FASTA record as read only
+     with open(fasta_file, "r") as f:
+         # Iterate over records in the FASTA
+         for record in SeqIO.parse(f, "fasta"):
+             # Extract the sequence string of the record
+             sequence = str(record.seq).upper()
+             sequence_length = len(sequence)
+             record_dict[record.id] = [sequence_length, sequence]
+     return record_dict
+ return record_dict