smftools 0.1.3__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/METADATA +44 -11
- smftools-0.1.6.dist-info/RECORD +4 -0
- smftools/__init__.py +0 -25
- smftools/_settings.py +0 -20
- smftools/_version.py +0 -1
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +0 -5
- smftools/datasets/__init__.py +0 -9
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +0 -28
- smftools/informatics/__init__.py +0 -14
- smftools/informatics/archived/bam_conversion.py +0 -59
- smftools/informatics/archived/bam_direct.py +0 -63
- smftools/informatics/archived/basecalls_to_adata.py +0 -71
- smftools/informatics/conversion_smf.py +0 -79
- smftools/informatics/direct_smf.py +0 -89
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
- smftools/informatics/helpers/__init__.py +0 -60
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -48
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -73
- smftools/informatics/helpers/archived/informatics.py +0 -260
- smftools/informatics/helpers/archived/load_adata.py +0 -516
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
- smftools/informatics/helpers/canoncall.py +0 -25
- smftools/informatics/helpers/complement_base_list.py +0 -21
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -54
- smftools/informatics/helpers/converted_BAM_to_adata.py +0 -233
- smftools/informatics/helpers/count_aligned_reads.py +0 -43
- smftools/informatics/helpers/extract_base_identities.py +0 -57
- smftools/informatics/helpers/extract_mods.py +0 -51
- smftools/informatics/helpers/extract_readnames_from_BAM.py +0 -22
- smftools/informatics/helpers/find_conversion_sites.py +0 -61
- smftools/informatics/helpers/generate_converted_FASTA.py +0 -98
- smftools/informatics/helpers/get_chromosome_lengths.py +0 -32
- smftools/informatics/helpers/get_native_references.py +0 -28
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/make_modbed.py +0 -27
- smftools/informatics/helpers/modQC.py +0 -27
- smftools/informatics/helpers/modcall.py +0 -28
- smftools/informatics/helpers/modkit_extract_to_adata.py +0 -518
- smftools/informatics/helpers/ohe_batching.py +0 -52
- smftools/informatics/helpers/one_hot_encode.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -52
- smftools/informatics/helpers/separate_bam_by_bc.py +0 -43
- smftools/informatics/helpers/split_and_index_BAM.py +0 -41
- smftools/informatics/load_adata.py +0 -127
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/informatics/subsample_pod5.py +0 -104
- smftools/plotting/__init__.py +0 -0
- smftools/preprocessing/__init__.py +0 -34
- smftools/preprocessing/append_C_context.py +0 -69
- smftools/preprocessing/archives/preprocessing.py +0 -614
- smftools/preprocessing/binarize_on_Youden.py +0 -42
- smftools/preprocessing/binary_layers_to_ohe.py +0 -30
- smftools/preprocessing/calculate_complexity.py +0 -71
- smftools/preprocessing/calculate_consensus.py +0 -47
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -96
- smftools/preprocessing/calculate_coverage.py +0 -41
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
- smftools/preprocessing/calculate_position_Youden.py +0 -104
- smftools/preprocessing/calculate_read_length_stats.py +0 -86
- smftools/preprocessing/clean_NaN.py +0 -38
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -29
- smftools/preprocessing/filter_reads_on_length.py +0 -41
- smftools/preprocessing/invert_adata.py +0 -23
- smftools/preprocessing/load_sample_sheet.py +0 -24
- smftools/preprocessing/make_dirs.py +0 -21
- smftools/preprocessing/mark_duplicates.py +0 -134
- smftools/preprocessing/min_non_diagonal.py +0 -25
- smftools/preprocessing/recipes.py +0 -125
- smftools/preprocessing/remove_duplicates.py +0 -21
- smftools/readwrite.py +0 -106
- smftools/tools/__init__.py +0 -0
- smftools/tools/apply_HMM.py +0 -1
- smftools/tools/cluster.py +0 -0
- smftools/tools/read_HMM.py +0 -1
- smftools/tools/subset_adata.py +0 -32
- smftools/tools/train_HMM.py +0 -43
- smftools-0.1.3.dist-info/RECORD +0 -84
- {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
## LoadExperimentConfig
|
|
2
|
-
|
|
3
|
-
class LoadExperimentConfig:
    """
    Loads an experiment configuration csv and stores its parameters in a dictionary.

    The csv is expected to have the columns ``variable``, ``value`` and ``type``.
    Every row becomes one entry of ``var_dict`` keyed by ``str(variable)``.
    Empty cells and the literal text ``None`` are stored as ``None``. Any other
    value is converted according to its ``type`` column:

        list   -> surrounding ``()[]`` removed, split on commas, items stripped
        int    -> ``int(value)``
        float  -> ``float(value)``
        bool   -> ``True`` only when the text equals ``true`` (case-insensitive)
        string -> ``str(value)``

    Rows with any other type keep the value exactly as pandas parsed it.
    When a variable name occurs more than once, the last row wins.

    Parameters:
        experiment_config (str): A string representing the file path to the experiment configuration csv file.
            It is passed straight to ``pandas.read_csv``, so a file-like object works as well.

    Attributes:
        var_dict (dict): A dictionary containing experiment configuration parameters.

    Example:
        >>> from io import StringIO
        >>> csv_data = '''variable,value,type
        ... mapping_threshold,0.05,float
        ... batch_size,4,int
        ... testing_bool,True,bool
        ... strands,"[bottom, top]",list
        ... split_dir,split_bams,string
        ... pod5_dir,None,string
        ... fast5_dir,,string
        ... '''
        >>> config_loader = LoadExperimentConfig(StringIO(csv_data))
        >>> config_loader.var_dict['mapping_threshold']
        0.05
        >>> config_loader.var_dict['batch_size']
        4
        >>> config_loader.var_dict['testing_bool']
        True
        >>> config_loader.var_dict['strands']
        ['bottom', 'top']
        >>> config_loader.var_dict['split_dir']
        'split_bams'
        >>> config_loader.var_dict['pod5_dir'] is None
        True
        >>> config_loader.var_dict['fast5_dir'] is None
        True
    """

    def __init__(self, experiment_config):
        import pandas as pd

        # Read the CSV into a pandas DataFrame
        df = pd.read_csv(experiment_config)
        # Initialize an empty dictionary to store variables
        var_dict = {}
        # Iterate through each row in the DataFrame
        for _, row in df.iterrows():
            var_name = str(row['variable'])
            value = row['value']
            dtype = row['type']
            # Handle empty and None values
            if pd.isna(value) or (isinstance(value, str) and value in ('None', '')):
                value = None
            elif dtype == 'list':
                # Convert the string representation of a list to an actual list
                text = str(value).strip().strip('()[]')
                value = [item.strip() for item in text.split(',')]
            elif dtype == 'int':
                value = int(value)
            elif dtype == 'float':
                value = float(value)
            elif dtype == 'bool':
                # pandas may already have parsed the cell as a bool (for example
                # when every value in the column is True/False), so normalise
                # through str() before comparing instead of calling .lower()
                # on the raw cell.
                value = str(value).strip().lower() == 'true'
            elif dtype == 'string':
                value = str(value)
            # Store the variable in the dictionary
            var_dict[var_name] = value
        # Save the dictionary as an attribute of the class
        self.var_dict = var_dict
|
|
74
|
-
|
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
from .align_and_sort_BAM import align_and_sort_BAM
|
|
2
|
-
from .aligned_BAM_to_bed import aligned_BAM_to_bed
|
|
3
|
-
from .bed_to_bigwig import bed_to_bigwig
|
|
4
|
-
from .binarize_converted_base_identities import binarize_converted_base_identities
|
|
5
|
-
from .canoncall import canoncall
|
|
6
|
-
from .complement_base_list import complement_base_list
|
|
7
|
-
from .converted_BAM_to_adata import converted_BAM_to_adata
|
|
8
|
-
from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
|
|
9
|
-
from .count_aligned_reads import count_aligned_reads
|
|
10
|
-
from .extract_base_identities import extract_base_identities
|
|
11
|
-
from .extract_mods import extract_mods
|
|
12
|
-
from .extract_readnames_from_BAM import extract_readnames_from_BAM
|
|
13
|
-
from .find_conversion_sites import find_conversion_sites
|
|
14
|
-
from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
|
|
15
|
-
from .get_chromosome_lengths import get_chromosome_lengths
|
|
16
|
-
from .get_native_references import get_native_references
|
|
17
|
-
from .index_fasta import index_fasta
|
|
18
|
-
from .LoadExperimentConfig import LoadExperimentConfig
|
|
19
|
-
from .make_dirs import make_dirs
|
|
20
|
-
from .make_modbed import make_modbed
|
|
21
|
-
from .modcall import modcall
|
|
22
|
-
from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
23
|
-
from .modQC import modQC
|
|
24
|
-
from .one_hot_encode import one_hot_encode
|
|
25
|
-
from .ohe_batching import ohe_batching
|
|
26
|
-
from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
|
|
27
|
-
from .separate_bam_by_bc import separate_bam_by_bc
|
|
28
|
-
from .split_and_index_BAM import split_and_index_BAM
|
|
29
|
-
|
|
30
|
-
__all__ = [
|
|
31
|
-
"align_and_sort_BAM",
|
|
32
|
-
"aligned_BAM_to_bed",
|
|
33
|
-
"bed_to_bigwig",
|
|
34
|
-
"binarize_converted_base_identities",
|
|
35
|
-
"canoncall",
|
|
36
|
-
"complement_base_list",
|
|
37
|
-
"converted_BAM_to_adata",
|
|
38
|
-
"concatenate_fastqs_to_bam",
|
|
39
|
-
"count_aligned_reads",
|
|
40
|
-
"extract_base_identities",
|
|
41
|
-
"extract_mods",
|
|
42
|
-
"extract_readnames_from_BAM",
|
|
43
|
-
"find_conversion_sites",
|
|
44
|
-
"convert_FASTA_record",
|
|
45
|
-
"generate_converted_FASTA",
|
|
46
|
-
"get_chromosome_lengths",
|
|
47
|
-
"get_native_references",
|
|
48
|
-
"index_fasta",
|
|
49
|
-
"LoadExperimentConfig",
|
|
50
|
-
"make_dirs",
|
|
51
|
-
"make_modbed",
|
|
52
|
-
"modcall",
|
|
53
|
-
"modkit_extract_to_adata",
|
|
54
|
-
"modQC",
|
|
55
|
-
"one_hot_encode",
|
|
56
|
-
"ohe_batching",
|
|
57
|
-
"plot_read_length_and_coverage_histograms",
|
|
58
|
-
"separate_bam_by_bc",
|
|
59
|
-
"split_and_index_BAM"
|
|
60
|
-
]
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
## align_and_sort_BAM
|
|
2
|
-
|
|
3
|
-
def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
    """
    A wrapper for running dorado aligner and samtools functions

    Parameters:
        fasta (str): File path to the reference genome to align to.
        input (str): File path to the basecalled file to align. Works for .bam and .fastq files
        bam_suffix (str): The suffix to use for the BAM file.
        output_directory (str): A file path to the directory to output all the analyses.

    Returns:
        None
            The function writes an aligned BAM (``<stem>_aligned<bam_suffix>``), a
            coordinate-sorted BAM (``<stem>_aligned_sorted<bam_suffix>``) and its
            samtools index into ``output_directory``, where ``<stem>`` is the input
            file name up to its first ``.``. It then calls ``aligned_BAM_to_bed`` and
            ``extract_readnames_from_BAM`` on the sorted BAM.
    """
    import subprocess
    import os
    from .aligned_BAM_to_bed import aligned_BAM_to_bed
    from .extract_readnames_from_BAM import extract_readnames_from_BAM
    from .make_dirs import make_dirs

    # Everything before the first '.' of the file name is the output stem.
    # partition() yields the same stem as the former split('.')[1] based logic
    # but does not raise IndexError when the name contains no '.' at all.
    input_stem = os.path.basename(input).partition('.')[0]
    output_path_minus_suffix = os.path.join(output_directory, input_stem)

    aligned_BAM = f"{output_path_minus_suffix}_aligned"
    aligned_sorted_BAM = f"{aligned_BAM}_sorted"
    aligned_output = aligned_BAM + bam_suffix
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix

    # Run dorado aligner; the context manager closes the output handle once
    # dorado has finished writing to it.
    with open(aligned_output, "w") as aligned_handle:
        subprocess.run(["dorado", "aligner", "--secondary", "no", fasta, input], stdout=aligned_handle)

    # Sort the BAM on positional coordinates
    subprocess.run(["samtools", "sort", "-o", aligned_sorted_output, aligned_output])

    # Create a BAM index file
    subprocess.run(["samtools", "index", aligned_sorted_output])

    # Make a bed file of coordinates for the BAM
    plotting_dir = os.path.join(output_directory, 'coverage_and_readlength_histograms')
    bed_dir = os.path.join(output_directory, 'read_alignment_coordinates')
    make_dirs([plotting_dir, bed_dir])
    aligned_BAM_to_bed(aligned_sorted_output, plotting_dir, bed_dir, fasta)

    # Make a text file of reads for the BAM
    extract_readnames_from_BAM(aligned_sorted_output)
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
# aligned_BAM_to_bed
|
|
2
|
-
|
|
3
|
-
def aligned_BAM_to_bed(aligned_BAM, plotting_dir, bed_dir, fasta):
    """
    Writes a BED-like table of the reads in an aligned BAM, then plots and converts it.

    ``samtools view`` is piped through ``awk`` to emit one tab-separated line per
    record: SAM field 3, SAM field 4, field 4 plus the length of field 10 minus
    one, the length of field 10 minus one, and SAM field 1. Lines whose first
    column is ``*`` are written to ``*_unaligned.bed``, all others to
    ``*_aligned.bed``, and the combined file is deleted. The aligned file is
    then passed to ``plot_read_length_and_coverage_histograms`` and
    ``bed_to_bigwig``.

    Parameters:
        aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
        plotting_dir (str): Path to write out read alignment length and coverage histograms
        bed_dir (str): Path to write out read alignment coordinates
        fasta (str): File path to the reference genome to align to.

    Returns:
        None
    """
    import subprocess
    import os
    from .bed_to_bigwig import bed_to_bigwig
    from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms

    def split_bed(bed, delete_input=True):
        """
        Splits a BED file into an unaligned and an aligned BED file.

        Parameters:
            bed (str): Path to the input BED file.
            delete_input (bool): Whether to delete the input bed file

        Returns:
            aligned (str): Path to the aligned bed file
        """
        stem = bed.split('.bed')[0]
        unaligned = stem + '_unaligned.bed'
        aligned = stem + '_aligned.bed'

        with open(bed, 'r') as source, \
             open(unaligned, 'w') as unaligned_sink, \
             open(aligned, 'w') as aligned_sink:
            for entry in source:
                # A '*' in the first column marks a read without a reference
                reference_name = entry.strip().split('\t')[0]
                sink = unaligned_sink if reference_name == '*' else aligned_sink
                sink.write(entry)

        if delete_input:
            os.remove(bed)

        return aligned

    bam_stem = os.path.basename(aligned_BAM).split('.bam')[0]
    bed_output = os.path.join(bed_dir, bam_stem + '_bed.bed')

    view_command = ["samtools", "view", aligned_BAM]
    awk_command = ["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}']

    # samtools view | awk > bed_output
    samtools_view = subprocess.Popen(view_command, stdout=subprocess.PIPE)
    with open(bed_output, "w") as output_file:
        awk_process = subprocess.Popen(awk_command, stdin=samtools_view.stdout, stdout=output_file)
        # Drop the parent's copy of the pipe so awk is the only reader
        samtools_view.stdout.close()
        awk_process.wait()
        samtools_view.wait()

    aligned_bed = split_bed(bed_output)

    # Write out basic plots of reference coverage and read lengths
    plot_read_length_and_coverage_histograms(aligned_bed, plotting_dir)

    # Make a bedgraph and bigwig for the aligned reads
    bed_to_bigwig(fasta, aligned_bed)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
@@ -1,260 +0,0 @@
|
|
|
1
|
-
## fasta_module
|
|
2
|
-
from .. import readwrite
|
|
3
|
-
# bioinformatic operations
|
|
4
|
-
from Bio import SeqIO
|
|
5
|
-
from Bio.SeqRecord import SeqRecord
|
|
6
|
-
from Bio.Seq import Seq
|
|
7
|
-
import pysam
|
|
8
|
-
|
|
9
|
-
######################################################################################################
|
|
10
|
-
## FASTA functionality
|
|
11
|
-
# General
|
|
12
|
-
|
|
13
|
-
# Conversion specific
|
|
14
|
-
def modify_sequence_and_id(record, modification_type, strand):
    """
    Input: Takes a FASTA record, modification type, and strand as input
    Output: Returns a new record of the same class holding the upper-cased,
    converted sequence. The new id is '<record.id>_<modification_type>_<strand>'
    and the description is copied from the input record.

    Conversions applied to the upper-cased sequence:
        5mC top: C -> T        5mC bottom: G -> A
        6mA top: A -> G        6mA bottom: T -> C
        unconverted: none (the strand is only used for the id and is not validated)

    Raises:
        ValueError: if modification_type is not 5mC, 6mA or unconverted, or if
        strand is not top or bottom for a 5mC/6mA conversion. Previously these
        cases printed the message and then failed on an unassigned variable.
    """
    # (modification_type, strand) -> (base to replace, replacement base)
    conversions = {
        ('5mC', 'top'): ('C', 'T'),
        ('5mC', 'bottom'): ('G', 'A'),
        ('6mA', 'top'): ('A', 'G'),
        ('6mA', 'bottom'): ('T', 'C'),
    }
    if modification_type == 'unconverted':
        new_seq = record.seq.upper()
    elif modification_type not in ('5mC', '6mA'):
        raise ValueError('need to provide a valid modification_type string: 5mC, 6mA, or unconverted')
    elif strand not in ('top', 'bottom'):
        raise ValueError('need to provide a valid strand string: top or bottom')
    else:
        original_base, converted_base = conversions[(modification_type, strand)]
        new_seq = record.seq.upper().replace(original_base, converted_base)
    new_id = '{0}_{1}_{2}'.format(record.id, modification_type, strand)
    # Return a new record of the same class with modified sequence and ID
    return record.__class__(new_seq, id=new_id, description=record.description)
|
|
44
|
-
|
|
45
|
-
def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta):
    """
    Input: Takes an input FASTA, modification types of interest, strands of interest, and an output FASTA name
    Output: Writes out a new fasta holding one converted record per input record,
    modification type and strand. The 'unconverted' type is emitted once per
    record, using the strand at index 0 of strands.
    Notes: Uses modify_sequence_and_id function on every record within the FASTA
    """
    with open(output_fasta, 'w') as output_handle:
        modified_records = []
        for record in SeqIO.parse(input_fasta, 'fasta'):
            for modification_type in modification_types:
                for strand_index, strand in enumerate(strands):
                    # The unconverted record is strand independent: keep only the first one
                    if modification_type == 'unconverted' and strand_index > 0:
                        continue
                    print(f'converting {modification_type} on the {strand} strand of record {record}')
                    converted_record = modify_sequence_and_id(record, modification_type, strand)
                    modified_records.append(converted_record)
        # write out the concatenated FASTA file of modified sequences
        SeqIO.write(modified_records, output_handle, 'fasta')
|
|
67
|
-
|
|
68
|
-
def find_coordinates(fasta_file, modification_type):
    """
    A function to find, in every unconverted record of a FASTA file, the
    0-indexed coordinates of the bases that carry the modification of interest.
    For 5mC these are C (top strand) and G (bottom strand); for 6mA they are
    A (top strand) and T (bottom strand).
    Input: A FASTA file and the modification_type of interest
    Returns:
        A dictionary called record_dict, which is keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) sequence string

    Notes:
        Only records whose id contains 'unconverted' are used. Each record gets
        its own coordinate lists; previously one pair of lists was shared by,
        and accumulated over, all records. An unknown modification_type prints
        a message and yields empty coordinate lists.
    """
    from datetime import datetime

    def _timestamp():
        # NOTE(review): the original called time_string(), which is neither
        # defined nor imported in this module; a stdlib clock time is used for
        # the log prefix instead -- confirm the preferred format.
        return datetime.now().strftime('%H:%M:%S')

    # modification type -> (top strand base, bottom strand base, log message)
    targets = {
        '5mC': ('C', 'G', '{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'),
        '6mA': ('A', 'T', '{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'),
    }

    print('{0}: Finding positions of interest in reference FASTA > {1}'.format(_timestamp(), fasta_file))
    record_dict = {}
    print('{0}: Opening FASTA file {1}'.format(_timestamp(), fasta_file))
    # Open the FASTA record as read only
    with open(fasta_file, "r") as f:
        # Iterate over records in the FASTA
        for record in SeqIO.parse(f, "fasta"):
            # Only iterate over the unconverted records for the reference
            if 'unconverted' not in record.id:
                continue
            print('{0}: Iterating over record {1} in FASTA file {2}'.format(_timestamp(), record, fasta_file))
            # Extract the sequence string of the record
            sequence = str(record.seq).upper()
            sequence_length = len(sequence)
            # Fresh lists per record so coordinates never leak between records
            top_strand_coordinates = []
            bottom_strand_coordinates = []
            if modification_type in targets:
                top_base, bottom_base, message = targets[modification_type]
                for i, base in enumerate(sequence):
                    if base == top_base:
                        top_strand_coordinates.append(i)  # 0-indexed coordinate
                    elif base == bottom_base:
                        bottom_strand_coordinates.append(i)  # 0-indexed coordinate
                print(message.format(_timestamp()))
            else:
                print('modification_type not found. Please try 5mC or 6mA')
            record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence]
    return record_dict
|
|
114
|
-
|
|
115
|
-
# Direct methylation specific
|
|
116
|
-
def get_references(fasta_file):
    """
    Input: A FASTA file
    Returns:
        A dictionary called record_dict, which is keyed by record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) upper-cased sequence of the record
    """
    from datetime import datetime

    record_dict = {}
    # NOTE(review): the original called time_string(), which is neither defined
    # nor imported in this module; a stdlib clock time is used for the log
    # prefix instead -- confirm the preferred format.
    print('{0}: Opening FASTA file {1}'.format(datetime.now().strftime('%H:%M:%S'), fasta_file))
    # Open the FASTA record as read only
    with open(fasta_file, "r") as f:
        # Iterate over records in the FASTA
        for record in SeqIO.parse(f, "fasta"):
            # Extract the sequence string of the record
            sequence = str(record.seq).upper()
            record_dict[record.id] = [len(sequence), sequence]
    return record_dict
|
|
133
|
-
######################################################################################################
|
|
134
|
-
|
|
135
|
-
######################################################################################################
|
|
136
|
-
## BAM functionality
|
|
137
|
-
# General
|
|
138
|
-
def separate_bam_by_bc(input_bam, output_prefix):
    """
    Input: Takes a single BAM input. Also takes an output prefix to append to the output file.
    Output: Splits the BAM based on the BC SAM tag value. Each read is written to
    '<output_prefix>_<suffix>.bam', where <suffix> is the text following
    'barcode' in the read's BC tag. Reads without a BC tag are reported and skipped.
    """
    # Output BAM files keyed by barcode suffix, opened lazily on first use
    output_files = {}
    try:
        # Open the input BAM file for reading
        with pysam.AlignmentFile(input_bam, "rb") as bam:
            # Iterate over each read in the BAM file
            for read in bam:
                try:
                    # Get the barcode tag value
                    bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                    # Open the output BAM file corresponding to the barcode
                    if bc_tag not in output_files:
                        output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}.bam", "wb", header=bam.header)
                    # Write the read to the corresponding output BAM file
                    output_files[bc_tag].write(read)
                except KeyError:
                    print(f"BC tag not present for read: {read.query_name}")
    finally:
        # Close all output BAM files, also when reading or writing failed midway
        for output_file in output_files.values():
            output_file.close()
|
|
162
|
-
|
|
163
|
-
def count_aligned_reads(bam_file):
    """
    Input: A BAM alignment file.
    Output: The number of aligned/unaligned reads in the BAM file. Also returns a dictionary, keyed by reference name, that points to a tuple. The tuple contains an integer number of mapped reads to that reference, followed by the proportion of mapped reads that map to that reference
    """
    from datetime import datetime

    # NOTE(review): the original called time_string(), which is neither defined
    # nor imported in this module; a stdlib clock time is used for the log
    # prefix instead -- confirm the preferred format.
    print('{0}: Counting aligned reads in BAM > {1}'.format(datetime.now().strftime('%H:%M:%S'), bam_file))
    aligned_reads_count = 0
    unaligned_reads_count = 0
    # Number of mapped reads per reference name
    reads_per_reference = {}
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over reads to get the total mapped read counts and the reads that map to each reference
        for read in bam:
            if read.is_unmapped:
                unaligned_reads_count += 1
                continue
            aligned_reads_count += 1
            reads_per_reference[read.reference_name] = reads_per_reference.get(read.reference_name, 0) + 1
    # Pair every count with its share of all mapped reads. The dictionary is
    # empty when nothing mapped, so the division never sees a zero denominator.
    record_counts = {
        reference: (count, count / aligned_reads_count)
        for reference, count in reads_per_reference.items()
    }
    return aligned_reads_count, unaligned_reads_count, record_counts
|
|
189
|
-
|
|
190
|
-
def extract_base_identity_at_coordinates(bam_file, chromosome, positions, max_reference_length):
    """
    Input: A position sorted BAM file, chromosome number, position coordinate set, and reference length to extract the base identity from the read.
    Output: A dictionary, keyed by read name, that points to a list of Base identities from each read.
    The list has max_reference_length entries indexed by reference coordinate.
    If the read does not contain that position, the list holds a N value at that index.
    Only the first record seen for a given read name is used.
    """
    from datetime import datetime

    def _timestamp():
        # NOTE(review): the original called time_string(), which is neither
        # defined nor imported in this module; a stdlib clock time is used for
        # the log prefix instead -- confirm the preferred format.
        return datetime.now().strftime('%H:%M:%S')

    positions = set(positions)
    # key (read-name) -> value (list of base identities at positions of interest)
    base_identities = {}
    # Open the position sorted BAM file
    print('{0}: Reading BAM file: {1}'.format(_timestamp(), bam_file))
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over every read in the bam that comes from the chromosome of interest
        print('{0}: Iterating over reads in bam'.format(_timestamp()))
        for read in bam.fetch(chromosome):
            if read.query_name in base_identities:
                # Duplicate read name: keep the first record only
                continue
            # Start from a N filled list of length max_reference_length
            read_bases = ['N'] * max_reference_length
            base_identities[read.query_name] = read_bases
            # Pairs of (0-indexed position in the read, 0-indexed position in the reference)
            for read_position, reference_position in read.get_aligned_pairs():
                # Compare against None explicitly: read position 0 is a valid
                # offset and must not be treated as an unmapped position.
                if read_position is not None and reference_position in positions:
                    read_bases[reference_position] = read.query_sequence[read_position]
    return base_identities
|
|
220
|
-
|
|
221
|
-
# Conversion SMF specific
|
|
222
|
-
def binarize_converted_base_identities(base_identities, strand, modification_type):
    """
    Input: The base identities dictionary returned by extract_base_identity_at_coordinates.
    Output: A binarized format of the dictionary, where 1 represents a methylated site. 0 represents an unmethylated site. NaN represents a site that does not carry SMF information.

    Base pairs used (methylated -> 1, unmethylated -> 0):
        top 5mC: C / T       top 6mA: A / G
        bottom 5mC: G / A    bottom 6mA: T / C
    Any other strand / modification_type combination returns an empty dictionary.
    """
    # numpy is not imported at module level in this file, so import it here
    import numpy as np

    # (strand, modification_type) -> (methylated base, unmethylated base)
    base_pairs = {
        ('top', '5mC'): ('C', 'T'),
        ('top', '6mA'): ('A', 'G'),
        ('bottom', '5mC'): ('G', 'A'),
        ('bottom', '6mA'): ('T', 'C'),
    }
    binarized_base_identities = {}
    pair = base_pairs.get((strand, modification_type))
    if pair is None:
        return binarized_base_identities
    methylated, unmethylated = pair
    for key, bases in base_identities.items():
        binarized_base_identities[key] = [
            1 if x == methylated else 0 if x == unmethylated else np.nan
            for x in bases
        ]
    return binarized_base_identities
|
|
243
|
-
|
|
244
|
-
# Direct methylation specific
|
|
245
|
-
|
|
246
|
-
######################################################################################################
|
|
247
|
-
|
|
248
|
-
######################################################################################################
|
|
249
|
-
# String encodings
|
|
250
|
-
def one_hot_encode(sequence):
    """
    Input: A sequence string of a read.
    Output: One hot encoding of the sequence string, as an integer array with one
    row per nucleotide and five columns in the order A, C, G, T, N.
    Raises: KeyError for any character other than upper-case A, C, G, T or N.
    """
    # numpy is not imported at module level in this file, so import it here
    import numpy as np

    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
    one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
    for i, nucleotide in enumerate(sequence):
        one_hot_matrix[i, mapping[nucleotide]] = 1
    return one_hot_matrix
|
|
260
|
-
######################################################################################################
|