smftools 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +0 -2
- smftools/_settings.py +1 -1
- smftools/_version.py +1 -0
- smftools/datasets/datasets.py +11 -9
- smftools/informatics/__init__.py +8 -7
- smftools/informatics/bam_conversion.py +47 -0
- smftools/informatics/bam_direct.py +49 -0
- smftools/informatics/basecalls_to_adata.py +42 -0
- smftools/informatics/fast5_to_pod5.py +19 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
- smftools/informatics/helpers/__init__.py +4 -4
- smftools/informatics/helpers/align_and_sort_BAM.py +52 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +10 -3
- smftools/informatics/helpers/canoncall.py +12 -1
- smftools/informatics/helpers/converted_BAM_to_adata.py +30 -13
- smftools/informatics/helpers/count_aligned_reads.py +12 -5
- smftools/informatics/helpers/extract_base_identities.py +13 -6
- smftools/informatics/helpers/extract_mods.py +17 -5
- smftools/informatics/helpers/find_conversion_sites.py +15 -9
- smftools/informatics/helpers/generate_converted_FASTA.py +49 -29
- smftools/informatics/helpers/get_native_references.py +10 -7
- smftools/informatics/helpers/make_dirs.py +9 -3
- smftools/informatics/helpers/make_modbed.py +10 -4
- smftools/informatics/helpers/modQC.py +10 -2
- smftools/informatics/helpers/modcall.py +13 -1
- smftools/informatics/helpers/modkit_extract_to_adata.py +25 -13
- smftools/informatics/helpers/one_hot_encode.py +8 -3
- smftools/informatics/helpers/separate_bam_by_bc.py +18 -5
- smftools/informatics/helpers/split_and_index_BAM.py +18 -10
- smftools/informatics/pod5_conversion.py +34 -7
- smftools/informatics/pod5_direct.py +31 -5
- smftools/informatics/pod5_to_adata.py +31 -8
- smftools/informatics/readwrite.py +13 -16
- smftools/informatics/subsample_pod5.py +48 -0
- smftools/preprocessing/__init__.py +0 -6
- smftools/preprocessing/append_C_context.py +15 -8
- smftools/preprocessing/binarize_on_Youden.py +8 -4
- smftools/preprocessing/binary_layers_to_ohe.py +9 -4
- smftools/preprocessing/calculate_complexity.py +26 -14
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +12 -5
- smftools/preprocessing/calculate_coverage.py +13 -7
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
- smftools/preprocessing/calculate_position_Youden.py +21 -12
- smftools/preprocessing/calculate_read_length_stats.py +11 -6
- smftools/preprocessing/clean_NaN.py +12 -5
- smftools/preprocessing/filter_converted_reads_on_methylation.py +12 -5
- smftools/preprocessing/filter_reads_on_length.py +13 -5
- smftools/preprocessing/invert_adata.py +9 -5
- smftools/preprocessing/mark_duplicates.py +20 -11
- smftools/preprocessing/min_non_diagonal.py +9 -4
- smftools/preprocessing/remove_duplicates.py +9 -3
- smftools/readwrite.py +13 -16
- smftools-0.1.1.dist-info/METADATA +88 -0
- smftools-0.1.1.dist-info/RECORD +64 -0
- smftools/informatics/helpers/align_BAM.py +0 -49
- smftools/informatics/helpers/load_experiment_config.py +0 -17
- smftools-0.1.0.dist-info/METADATA +0 -75
- smftools-0.1.0.dist-info/RECORD +0 -58
- /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
- /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
- /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
- {smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.0.dist-info → smftools-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,22 +1,28 @@
|
|
|
1
1
|
## find_conversion_sites
|
|
2
|
-
from .. import readwrite
|
|
3
|
-
# bioinformatic operations
|
|
4
|
-
from Bio import SeqIO
|
|
5
|
-
from Bio.SeqRecord import SeqRecord
|
|
6
|
-
from Bio.Seq import Seq
|
|
7
2
|
|
|
8
|
-
def find_conversion_sites(fasta_file, modification_type):
|
|
3
|
+
def find_conversion_sites(fasta_file, modification_type, conversion_types):
|
|
9
4
|
"""
|
|
10
5
|
A function to find genomic coordinates in every unconverted record contained within a FASTA file of every cytosine.
|
|
11
6
|
If searching for adenine conversions, it will find coordinates of all adenines.
|
|
12
|
-
|
|
7
|
+
|
|
8
|
+
Parameters:
|
|
9
|
+
fasta_file (str): A string representing the file path to the unconverted reference FASTA.
|
|
10
|
+
modification_type (str): A string representing the modification type of interest (options are '5mC' and '6mA').
|
|
11
|
+
conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
|
|
12
|
+
|
|
13
13
|
Returns:
|
|
14
|
-
|
|
14
|
+
record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string
|
|
15
15
|
"""
|
|
16
|
+
from .. import readwrite
|
|
17
|
+
from Bio import SeqIO
|
|
18
|
+
from Bio.SeqRecord import SeqRecord
|
|
19
|
+
from Bio.Seq import Seq
|
|
20
|
+
|
|
16
21
|
print('{0}: Finding positions of interest in reference FASTA > {1}'.format(readwrite.time_string(), fasta_file))
|
|
17
22
|
# Initialize lists to hold top and bottom strand positional coordinates of interest
|
|
18
23
|
top_strand_coordinates = []
|
|
19
24
|
bottom_strand_coordinates = []
|
|
25
|
+
unconverted = conversion_types[0]
|
|
20
26
|
record_dict = {}
|
|
21
27
|
print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
|
|
22
28
|
# Open the FASTA record as read only
|
|
@@ -24,7 +30,7 @@ def find_conversion_sites(fasta_file, modification_type):
|
|
|
24
30
|
# Iterate over records in the FASTA
|
|
25
31
|
for record in SeqIO.parse(f, "fasta"):
|
|
26
32
|
# Only iterate over the unconverted records for the reference
|
|
27
|
-
if
|
|
33
|
+
if unconverted in record.id:
|
|
28
34
|
print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
|
|
29
35
|
# Extract the sequence string of the record
|
|
30
36
|
sequence = str(record.seq).upper()
|
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
## generate_converted_FASTA
|
|
2
|
-
from .. import readwrite
|
|
3
|
-
# bioinformatic operations
|
|
4
|
-
from Bio import SeqIO
|
|
5
|
-
from Bio.SeqRecord import SeqRecord
|
|
6
|
-
from Bio.Seq import Seq
|
|
7
2
|
|
|
8
|
-
def convert_FASTA_record(record, modification_type, strand):
|
|
3
|
+
def convert_FASTA_record(record, modification_type, strand, unconverted):
|
|
9
4
|
"""
|
|
10
|
-
|
|
11
|
-
|
|
5
|
+
Takes a FASTA record and converts every instance of a base to the converted state.
|
|
6
|
+
|
|
7
|
+
Parameters:
|
|
8
|
+
record (SeqRecord): The FASTA record to convert.
|
|
9
|
+
modification_type (str): The modification type to convert for (options are '5mC' and '6mA').
|
|
10
|
+
strand (str): The strand that is being converted in the experiment (options are 'top' and 'bottom').
|
|
11
|
+
Returns:
|
|
12
|
+
new_seq (str): Converted sequence string.
|
|
13
|
+
new_id (str): Record id for the converted sequence string.
|
|
12
14
|
"""
|
|
13
15
|
if modification_type == '5mC':
|
|
14
16
|
if strand == 'top':
|
|
@@ -18,7 +20,8 @@ def convert_FASTA_record(record, modification_type, strand):
|
|
|
18
20
|
# Replace every 'G' with 'A' in the sequence
|
|
19
21
|
new_seq = record.seq.upper().replace('G', 'A')
|
|
20
22
|
else:
|
|
21
|
-
print('need to provide a valid strand string: top or bottom')
|
|
23
|
+
print('need to provide a valid strand string: top or bottom')
|
|
24
|
+
new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
|
|
22
25
|
elif modification_type == '6mA':
|
|
23
26
|
if strand == 'top':
|
|
24
27
|
# Replace every 'A' with 'G' in the sequence
|
|
@@ -28,32 +31,49 @@ def convert_FASTA_record(record, modification_type, strand):
|
|
|
28
31
|
new_seq = record.seq.upper().replace('T', 'C')
|
|
29
32
|
else:
|
|
30
33
|
print('need to provide a valid strand string: top or bottom')
|
|
31
|
-
|
|
34
|
+
new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
|
|
35
|
+
elif modification_type == unconverted:
|
|
32
36
|
new_seq = record.seq.upper()
|
|
37
|
+
new_id = '{0}_{1}_top'.format(record.id, modification_type)
|
|
33
38
|
else:
|
|
34
|
-
print('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')
|
|
35
|
-
|
|
36
|
-
|
|
39
|
+
print(f'need to provide a valid modification_type string: 5mC, 6mA, or {unconverted}')
|
|
40
|
+
|
|
41
|
+
return new_seq, new_id
|
|
37
42
|
|
|
38
43
|
def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
|
|
39
44
|
"""
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
45
|
+
Uses the convert_FASTA_record function on every record within the FASTA to write out a converted FASTA.
|
|
46
|
+
|
|
47
|
+
Parameters:
|
|
48
|
+
input_fasta (str): A string representing the path to the unconverted FASTA file.
|
|
49
|
+
modification_types (list): A list of modification types to use in the experiment.
|
|
50
|
+
strands (list): A list of conversion strands to use in the experiment.
|
|
51
|
+
output_fasta (str): A string representing the path to the converted FASTA output file.
|
|
52
|
+
Returns:
|
|
53
|
+
None
|
|
54
|
+
Writes out a converted FASTA reference for the experiment.
|
|
43
55
|
"""
|
|
56
|
+
from .. import readwrite
|
|
57
|
+
from Bio import SeqIO
|
|
58
|
+
from Bio.SeqRecord import SeqRecord
|
|
59
|
+
from Bio.Seq import Seq
|
|
60
|
+
modified_records = []
|
|
61
|
+
unconverted = modification_types[0]
|
|
62
|
+
# Iterate over each record in the input FASTA
|
|
63
|
+
for record in SeqIO.parse(input_fasta, 'fasta'):
|
|
64
|
+
record_description = record.description
|
|
65
|
+
# Iterate over each modification type of interest
|
|
66
|
+
for modification_type in modification_types:
|
|
67
|
+
# Iterate over the strands of interest
|
|
68
|
+
for i, strand in enumerate(strands):
|
|
69
|
+
if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
|
|
70
|
+
pass
|
|
71
|
+
else:
|
|
72
|
+
# Add the modified record to the list of modified records
|
|
73
|
+
print(f'converting {modification_type} on the {strand} strand of record {record}')
|
|
74
|
+
new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
|
|
75
|
+
new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
|
|
76
|
+
modified_records.append(new_record)
|
|
44
77
|
with open(output_fasta, 'w') as output_handle:
|
|
45
|
-
modified_records = []
|
|
46
|
-
# Iterate over each record in the input FASTA
|
|
47
|
-
for record in SeqIO.parse(input_fasta, 'fasta'):
|
|
48
|
-
# Iterate over each modification type of interest
|
|
49
|
-
for modification_type in modification_types:
|
|
50
|
-
# Iterate over the strands of interest
|
|
51
|
-
for i, strand in enumerate(strands):
|
|
52
|
-
if i > 0 and modification_type == 'unconverted': # This ensures that the unconverted only is added once and takes on the strand that is provided at the 0 index on strands.
|
|
53
|
-
pass
|
|
54
|
-
else:
|
|
55
|
-
# Add the modified record to the list of modified records
|
|
56
|
-
print(f'converting {modification_type} on the {strand} strand of record {record}')
|
|
57
|
-
modified_records.append(convert_FASTA_record(record, modification_type, strand))
|
|
58
78
|
# write out the concatenated FASTA file of modified sequences
|
|
59
79
|
SeqIO.write(modified_records, output_handle, 'fasta')
|
|
@@ -1,17 +1,20 @@
|
|
|
1
1
|
## get_native_references
|
|
2
|
-
from .. import readwrite
|
|
3
|
-
# bioinformatic operations
|
|
4
|
-
from Bio import SeqIO
|
|
5
|
-
from Bio.SeqRecord import SeqRecord
|
|
6
|
-
from Bio.Seq import Seq
|
|
7
2
|
|
|
8
3
|
# Direct methylation specific
|
|
9
4
|
def get_native_references(fasta_file):
|
|
10
5
|
"""
|
|
11
|
-
|
|
6
|
+
Makes a dictionary keyed by record id which points to the record length and record sequence.
|
|
7
|
+
|
|
8
|
+
Parameters:
|
|
9
|
+
fasta_file (str): A string representing the path to the FASTA file for the experiment.
|
|
10
|
+
|
|
12
11
|
Returns:
|
|
13
|
-
|
|
12
|
+
None
|
|
14
13
|
"""
|
|
14
|
+
from .. import readwrite
|
|
15
|
+
from Bio import SeqIO
|
|
16
|
+
from Bio.SeqRecord import SeqRecord
|
|
17
|
+
from Bio.Seq import Seq
|
|
15
18
|
record_dict = {}
|
|
16
19
|
print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
|
|
17
20
|
# Open the FASTA record as read only
|
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
## make_dirs
|
|
2
|
-
import os
|
|
3
2
|
|
|
4
3
|
# General
|
|
5
4
|
def make_dirs(directories):
|
|
6
5
|
"""
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
Takes a list of file paths and makes new directories if the directory does not already exist.
|
|
7
|
+
|
|
8
|
+
Parameters:
|
|
9
|
+
directories (list): A list of directories to make
|
|
10
|
+
|
|
11
|
+
Returns:
|
|
12
|
+
None
|
|
9
13
|
"""
|
|
14
|
+
import os
|
|
15
|
+
|
|
10
16
|
for directory in directories:
|
|
11
17
|
if not os.path.isdir(directory):
|
|
12
18
|
os.mkdir(directory)
|
|
@@ -1,19 +1,25 @@
|
|
|
1
1
|
## make_modbed
|
|
2
|
-
import os
|
|
3
|
-
import subprocess
|
|
4
2
|
|
|
5
3
|
# Direct SMF
|
|
6
4
|
def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
|
|
7
5
|
"""
|
|
8
|
-
Generating
|
|
6
|
+
Generating position methylation summaries for each barcoded sample starting from the overall BAM file that was direct output of dorado aligner.
|
|
7
|
+
Parameters:
|
|
8
|
+
aligned_sorted_output (str): A string representing the file path to the aligned_sorted non-split BAM file.
|
|
9
|
+
|
|
10
|
+
Returns:
|
|
11
|
+
None
|
|
9
12
|
"""
|
|
13
|
+
import os
|
|
14
|
+
import subprocess
|
|
15
|
+
|
|
10
16
|
os.chdir(mod_bed_dir)
|
|
11
17
|
filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
|
|
12
18
|
command = [
|
|
13
19
|
"modkit", "pileup", aligned_sorted_output, mod_bed_dir,
|
|
14
20
|
"--partition-tag", "BC",
|
|
15
21
|
"--only-tabs",
|
|
16
|
-
"--filter-threshold", filter_threshold,
|
|
22
|
+
"--filter-threshold", f'{filter_threshold}',
|
|
17
23
|
"--mod-thresholds", f"m:{m5C_threshold}",
|
|
18
24
|
"--mod-thresholds", f"a:{m6A_threshold}",
|
|
19
25
|
"--mod-thresholds", f"h:{hm5C_threshold}"
|
|
@@ -1,17 +1,25 @@
|
|
|
1
1
|
## modQC
|
|
2
|
-
import subprocess
|
|
3
2
|
|
|
4
3
|
# Direct SMF
|
|
5
4
|
def modQC(aligned_sorted_output, thresholds):
|
|
6
5
|
"""
|
|
7
6
|
Output the percentile of bases falling at a call threshold (threshold is a probability between 0-1) for the overall BAM file.
|
|
8
7
|
It is generally good to look at these parameters on positive and negative controls.
|
|
8
|
+
|
|
9
|
+
Parameters:
|
|
10
|
+
aligned_sorted_output (str): A string representing the file path of the aligned_sorted non-split BAM file output by the dorado aligner.
|
|
11
|
+
thresholds (list): A list of floats to pass for call thresholds.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
None
|
|
9
15
|
"""
|
|
16
|
+
import subprocess
|
|
17
|
+
|
|
10
18
|
filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
|
|
11
19
|
subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
|
|
12
20
|
command = [
|
|
13
21
|
"modkit", "summary", aligned_sorted_output,
|
|
14
|
-
"--filter-threshold", filter_threshold,
|
|
22
|
+
"--filter-threshold", f"{filter_threshold}",
|
|
15
23
|
"--mod-thresholds", f"m:{m5C_threshold}",
|
|
16
24
|
"--mod-thresholds", f"a:{m6A_threshold}",
|
|
17
25
|
"--mod-thresholds", f"h:{hm5C_threshold}"
|
|
@@ -1,11 +1,23 @@
|
|
|
1
1
|
## modcall
|
|
2
|
-
import subprocess
|
|
3
2
|
|
|
4
3
|
# Direct methylation specific
|
|
5
4
|
def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
|
|
6
5
|
"""
|
|
7
6
|
Wrapper function for dorado modified base calling.
|
|
7
|
+
|
|
8
|
+
Parameters:
|
|
9
|
+
model (str): a string representing the file path to the dorado basecalling model.
|
|
10
|
+
pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
|
|
11
|
+
barcode_kit (str): A string representing the barcoding kit used in the experiment.
|
|
12
|
+
mod_list (list): A list of modification types to use in the analysis.
|
|
13
|
+
bam (str): File path to the BAM file to output.
|
|
14
|
+
bam_suffix (str): The suffix to use for the BAM file.
|
|
15
|
+
|
|
16
|
+
Returns:
|
|
17
|
+
None
|
|
18
|
+
Outputs a BAM file holding the modified base calls output by the dorado basecaller.
|
|
8
19
|
"""
|
|
20
|
+
import subprocess
|
|
9
21
|
output = bam + bam_suffix
|
|
10
22
|
command = [
|
|
11
23
|
"dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
|
|
@@ -1,20 +1,31 @@
|
|
|
1
1
|
## modkit_extract_to_adata
|
|
2
|
-
from .. import readwrite
|
|
3
|
-
from .get_native_references import get_native_references
|
|
4
|
-
from .count_aligned_reads import count_aligned_reads
|
|
5
|
-
from .extract_base_identities import extract_base_identities
|
|
6
|
-
from .one_hot_encode import one_hot_encode
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import anndata as ad
|
|
9
|
-
import os
|
|
10
|
-
import gc
|
|
11
|
-
import math
|
|
12
|
-
import numpy as np
|
|
13
2
|
|
|
14
3
|
def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods, batch_size):
|
|
15
4
|
"""
|
|
16
|
-
|
|
5
|
+
Takes modkit extract outputs and organizes it into an adata object
|
|
6
|
+
|
|
7
|
+
Parameters:
|
|
8
|
+
fasta (str): File path to the reference genome to align to.
|
|
9
|
+
bam (str): File path to the aligned_sorted non-split modified BAM file
|
|
10
|
+
mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
|
|
11
|
+
experiment_name (str): A string to provide an experiment name to the output adata file.
|
|
12
|
+
mods (list): A list of strings of the modification types to use in the analysis.
|
|
13
|
+
batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
None
|
|
17
17
|
"""
|
|
18
|
+
from .. import readwrite
|
|
19
|
+
from .get_native_references import get_native_references
|
|
20
|
+
from .count_aligned_reads import count_aligned_reads
|
|
21
|
+
from .extract_base_identities import extract_base_identities
|
|
22
|
+
from .one_hot_encode import one_hot_encode
|
|
23
|
+
import pandas as pd
|
|
24
|
+
import anndata as ad
|
|
25
|
+
import os
|
|
26
|
+
import gc
|
|
27
|
+
import math
|
|
28
|
+
import numpy as np
|
|
18
29
|
###################################################
|
|
19
30
|
### Get input tsv file names into a sorted list ###
|
|
20
31
|
# List all files in the directory
|
|
@@ -56,7 +67,8 @@ def modkit_extract_to_adata(fasta, bam, mapping_threshold, experiment_name, mods
|
|
|
56
67
|
delta_max_length = max_reference_length - current_reference_length
|
|
57
68
|
sequence = reference_dict[record][1] + 'N'*delta_max_length
|
|
58
69
|
# Get a dictionary of positional base identities keyed by read id
|
|
59
|
-
|
|
70
|
+
positions = range(current_reference_length)
|
|
71
|
+
base_identities = extract_base_identities(bam, record, positions, max_reference_length)
|
|
60
72
|
# One hot encode the sequence string of the reads
|
|
61
73
|
one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in base_identities.items()}
|
|
62
74
|
record_seq_dict[record] = (one_hot_reads, sequence)
|
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
# one_hot_encode
|
|
2
|
-
from .. import readwrite
|
|
3
2
|
|
|
4
3
|
# String encodings
|
|
5
4
|
def one_hot_encode(sequence):
|
|
6
5
|
"""
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
One hot encodes a sequence string.
|
|
7
|
+
Parameters:
|
|
8
|
+
sequence (str): A DNA sequence string.
|
|
9
|
+
|
|
10
|
+
Returns:
|
|
11
|
+
one_hot_matrix (ndarray): A numpy ndarray holding a vstacked one hot encoding of the input sequence string.
|
|
9
12
|
"""
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
10
15
|
mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
|
|
11
16
|
one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
|
|
12
17
|
for i, nucleotide in enumerate(sequence):
|
|
@@ -1,12 +1,25 @@
|
|
|
1
1
|
## separate_bam_by_bc
|
|
2
|
-
import pysam
|
|
3
2
|
|
|
4
3
|
# General
|
|
5
|
-
def separate_bam_by_bc(input_bam, output_prefix):
|
|
4
|
+
def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
|
|
6
5
|
"""
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
Separates an input BAM file on the BC SAM tag values.
|
|
7
|
+
|
|
8
|
+
Parameters:
|
|
9
|
+
input_bam (str): File path to the BAM file to split.
|
|
10
|
+
output_prefix (str): A prefix to prepend to the output BAM file names.
|
|
11
|
+
bam_suffix (str): A suffix to add to the bam file.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
None
|
|
15
|
+
Writes out split BAM files.
|
|
9
16
|
"""
|
|
17
|
+
import pysam
|
|
18
|
+
import os
|
|
19
|
+
|
|
20
|
+
bam_base = os.path.basename(input_bam)
|
|
21
|
+
bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
|
|
22
|
+
|
|
10
23
|
# Open the input BAM file for reading
|
|
11
24
|
with pysam.AlignmentFile(input_bam, "rb") as bam:
|
|
12
25
|
# Create a dictionary to store output BAM files
|
|
@@ -18,7 +31,7 @@ def separate_bam_by_bc(input_bam, output_prefix):
|
|
|
18
31
|
bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
|
|
19
32
|
# Open the output BAM file corresponding to the barcode
|
|
20
33
|
if bc_tag not in output_files:
|
|
21
|
-
output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}
|
|
34
|
+
output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}", "wb", header=bam.header)
|
|
22
35
|
# Write the read to the corresponding output BAM file
|
|
23
36
|
output_files[bc_tag].write(read)
|
|
24
37
|
except KeyError:
|
|
@@ -1,21 +1,29 @@
|
|
|
1
1
|
## split_and_index_BAM
|
|
2
|
-
from .. import readwrite
|
|
3
|
-
import os
|
|
4
|
-
import subprocess
|
|
5
|
-
import glob
|
|
6
|
-
from .separate_bam_by_bc import separate_bam_by_bc
|
|
7
2
|
|
|
8
3
|
def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
|
|
9
4
|
"""
|
|
10
|
-
A wrapper function for splitting BAMS and indexing them
|
|
5
|
+
A wrapper function for splitting BAMS and indexing them.
|
|
6
|
+
Parameters:
|
|
7
|
+
aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
|
|
8
|
+
split_dir (str): A string representing the file path to the directory to split the BAMs into.
|
|
9
|
+
bam_suffix (str): A suffix to add to the bam file.
|
|
10
|
+
|
|
11
|
+
Returns:
|
|
12
|
+
None
|
|
13
|
+
Splits an input BAM file on barcode value and makes a BAM index file.
|
|
11
14
|
"""
|
|
15
|
+
from .. import readwrite
|
|
16
|
+
import os
|
|
17
|
+
import subprocess
|
|
18
|
+
import glob
|
|
19
|
+
from .separate_bam_by_bc import separate_bam_by_bc
|
|
20
|
+
|
|
12
21
|
os.chdir(split_dir)
|
|
13
22
|
aligned_sorted_output = aligned_sorted_BAM + bam_suffix
|
|
14
|
-
file_prefix = readwrite.
|
|
15
|
-
separate_bam_by_bc(aligned_sorted_output, file_prefix)
|
|
23
|
+
file_prefix = readwrite.date_string()
|
|
24
|
+
separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix)
|
|
16
25
|
# Make a BAM index file for the BAMs in that directory
|
|
17
26
|
bam_pattern = '*' + bam_suffix
|
|
18
27
|
bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
|
|
19
28
|
for input_file in bam_files:
|
|
20
|
-
subprocess.run(["samtools", "index", input_file])
|
|
21
|
-
print(f"Indexed {input_file}")
|
|
29
|
+
subprocess.run(["samtools", "index", input_file])
|
|
@@ -1,23 +1,50 @@
|
|
|
1
1
|
## pod5_conversion
|
|
2
|
-
from .helpers import align_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
|
|
3
|
-
import subprocess
|
|
4
2
|
|
|
5
3
|
def pod5_conversion(fasta, output_directory, conversion_types, strands, model, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix):
|
|
6
4
|
"""
|
|
7
|
-
Converts a POD5 file from a nanopore conversion SMF experiment to an adata object
|
|
5
|
+
Converts a POD5 file from a nanopore conversion SMF experiment to an adata object.
|
|
6
|
+
|
|
7
|
+
Parameters:
|
|
8
|
+
fasta (str): File path to the reference genome to align to.
|
|
9
|
+
output_directory (str): A file path to the directory to output all the analyses.
|
|
10
|
+
conversion_types (list): A list of strings of the conversion types to use in the analysis.
|
|
11
|
+
strands (list): A list of conversion strands to use in the experiment.
|
|
12
|
+
model (str): a string representing the file path to the dorado basecalling model.
|
|
13
|
+
pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
|
|
14
|
+
split_dir (str): A string representing the file path to the directory to split the BAMs into.
|
|
15
|
+
barcode_kit (str): A string representing the barcoding kit used in the experiment.
|
|
16
|
+
mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
|
|
17
|
+
experiment_name (str): A string to provide an experiment name to the output adata file.
|
|
18
|
+
bam_suffix (str): A suffix to add to the bam file.
|
|
19
|
+
|
|
20
|
+
Returns:
|
|
21
|
+
None
|
|
8
22
|
"""
|
|
9
|
-
|
|
23
|
+
from .helpers import align_and_sort_BAM, canoncall, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
|
|
24
|
+
import os
|
|
25
|
+
model_basename = os.path.basename(model)
|
|
26
|
+
model_basename = model_basename.replace('.', '_')
|
|
27
|
+
bam=f"{output_directory}/{model_basename}_canonical_basecalls"
|
|
10
28
|
aligned_BAM=f"{bam}_aligned"
|
|
11
29
|
aligned_sorted_BAM=f"{aligned_BAM}_sorted"
|
|
30
|
+
|
|
31
|
+
os.chdir(output_directory)
|
|
32
|
+
|
|
12
33
|
# 1) Convert FASTA file
|
|
13
|
-
|
|
14
|
-
|
|
34
|
+
fasta_basename = os.path.basename(fasta)
|
|
35
|
+
converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
|
|
36
|
+
converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
|
|
37
|
+
if os.path.exists(converted_FASTA):
|
|
38
|
+
print(converted_FASTA + ' already exists. Using existing converted FASTA.')
|
|
39
|
+
else:
|
|
40
|
+
generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
|
|
15
41
|
|
|
16
42
|
# 2) Basecall from the input POD5 to generate a singular output BAM
|
|
17
43
|
canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix)
|
|
18
44
|
|
|
19
45
|
# 3) Align the BAM to the converted reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
|
|
20
|
-
|
|
46
|
+
input_BAM = bam + bam_suffix
|
|
47
|
+
align_and_sort_BAM(converted_FASTA, input_BAM, bam_suffix, output_directory)
|
|
21
48
|
|
|
22
49
|
### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
|
|
23
50
|
split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
|
|
@@ -1,24 +1,50 @@
|
|
|
1
1
|
## pod5_direct
|
|
2
|
-
from .helpers import align_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM
|
|
3
2
|
|
|
4
3
|
def pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size):
|
|
5
4
|
"""
|
|
6
|
-
|
|
5
|
+
Converts a POD5 file from a nanopore native SMF experiment to an adata object.
|
|
6
|
+
|
|
7
|
+
Parameters:
|
|
8
|
+
fasta (str): File path to the reference genome to align to.
|
|
9
|
+
output_directory (str): A file path to the directory to output all the analyses.
|
|
10
|
+
mod_list (list): A list of strings of the modification types to use in the analysis.
|
|
11
|
+
model (str): a string representing the file path to the dorado basecalling model.
|
|
12
|
+
thresholds (list): A list of floats to pass for call thresholds.
|
|
13
|
+
pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
|
|
14
|
+
split_dir (str): A string representing the file path to the directory to split the BAMs into.
|
|
15
|
+
barcode_kit (str): A string representing the barcoding kit used in the experiment.
|
|
16
|
+
mapping_threshold (float): A value in between 0 and 1 to threshold the minimal fraction of aligned reads which map to the reference region. References with values above the threshold are included in the output adata.
|
|
17
|
+
experiment_name (str): A string to provide an experiment name to the output adata file.
|
|
18
|
+
bam_suffix (str): A suffix to add to the bam file.
|
|
19
|
+
batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
None
|
|
7
23
|
"""
|
|
8
|
-
|
|
24
|
+
from .helpers import align_and_sort_BAM, extract_mods, make_modbed, modcall, modkit_extract_to_adata, modQC, split_and_index_BAM, make_dirs
|
|
25
|
+
import os
|
|
26
|
+
model_basename = os.path.basename(model)
|
|
27
|
+
model_basename = model_basename.replace('.', '_')
|
|
28
|
+
mod_string = "_".join(mod_list)
|
|
29
|
+
bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
|
|
9
30
|
aligned_BAM=f"{bam}_aligned"
|
|
10
31
|
aligned_sorted_BAM=f"{aligned_BAM}_sorted"
|
|
11
32
|
mod_bed_dir=f"{output_directory}/split_mod_beds"
|
|
12
33
|
mod_tsv_dir=f"{output_directory}/split_mod_tsvs"
|
|
13
34
|
|
|
35
|
+
make_dirs([mod_bed_dir, mod_tsv_dir])
|
|
36
|
+
|
|
14
37
|
aligned_sorted_output = aligned_sorted_BAM + bam_suffix
|
|
15
38
|
mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
|
|
16
39
|
mods = [mod_map[mod] for mod in mod_list]
|
|
17
40
|
|
|
41
|
+
os.chdir(output_directory)
|
|
42
|
+
|
|
18
43
|
# 1) Basecall using dorado
|
|
19
44
|
modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix)
|
|
20
|
-
# 2) Align the BAM to the
|
|
21
|
-
|
|
45
|
+
# 2) Align the BAM to the reference FASTA. Also make an index and a bed file of mapped reads
|
|
46
|
+
input_BAM = bam + bam_suffix
|
|
47
|
+
align_and_sort_BAM(fasta, input_BAM, bam_suffix, output_directory)
|
|
22
48
|
# 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
|
|
23
49
|
split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
|
|
24
50
|
# 4) Using nanopore modkit to work with modified BAM files ###
|
|
@@ -1,17 +1,40 @@
|
|
|
1
1
|
## pod5_to_adata
|
|
2
|
-
from .helpers import load_experiment_config
|
|
3
|
-
from.pod5_direct import pod5_direct
|
|
4
|
-
from.pod5_conversion import pod5_conversion
|
|
5
2
|
|
|
6
|
-
def pod5_to_adata(config_path
|
|
3
|
+
def pod5_to_adata(config_path):
|
|
7
4
|
"""
|
|
8
|
-
|
|
5
|
+
High-level function to call for converting raw sequencing data to an adata object.
|
|
6
|
+
|
|
7
|
+
Parameters:
|
|
8
|
+
config_path (str): A string representing the file path to the experiment configuration csv file.
|
|
9
|
+
|
|
10
|
+
Returns:
|
|
11
|
+
None
|
|
9
12
|
"""
|
|
13
|
+
from .helpers import LoadExperimentConfig, make_dirs
|
|
14
|
+
import os
|
|
15
|
+
bam_suffix = '.bam' # If different, change from here.
|
|
16
|
+
split_dir = 'split_BAMs' # If different, change from here.
|
|
17
|
+
strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
|
|
18
|
+
conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
|
|
19
|
+
|
|
10
20
|
# Load experiment config parameters into global variables
|
|
11
|
-
|
|
21
|
+
experiment_config = LoadExperimentConfig(config_path)
|
|
22
|
+
var_dict = experiment_config.var_dict
|
|
23
|
+
for key, value in var_dict.items():
|
|
24
|
+
globals()[key] = value
|
|
25
|
+
|
|
26
|
+
conversions += conversion_types
|
|
27
|
+
|
|
28
|
+
split_path = os.path.join(output_directory, split_dir)
|
|
29
|
+
make_dirs([output_directory, split_path])
|
|
30
|
+
os.chdir(output_directory)
|
|
31
|
+
|
|
12
32
|
if smf_modality == 'conversion':
|
|
13
|
-
|
|
33
|
+
from .pod5_conversion import pod5_conversion
|
|
34
|
+
pod5_conversion(fasta, output_directory, conversions, strands, model, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix)
|
|
14
35
|
elif smf_modality == 'direct':
|
|
15
|
-
pod5_direct
|
|
36
|
+
from .pod5_direct import pod5_direct
|
|
37
|
+
thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
|
|
38
|
+
pod5_direct(fasta, output_directory, mod_list, model, thresholds, pod5_dir, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size)
|
|
16
39
|
else:
|
|
17
40
|
print("Error")
|