smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_settings.py +3 -2
- smftools/_version.py +1 -1
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/datasets.py +8 -7
- smftools/informatics/__init__.py +7 -5
- smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
- smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/conversion_smf.py +79 -0
- smftools/informatics/direct_smf.py +89 -0
- smftools/informatics/fast5_to_pod5.py +8 -6
- smftools/informatics/helpers/__init__.py +18 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
- smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
- smftools/informatics/helpers/canoncall.py +2 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
- smftools/informatics/helpers/count_aligned_reads.py +13 -9
- smftools/informatics/helpers/extract_base_identities.py +34 -20
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +11 -9
- smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/modcall.py +3 -1
- smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
- smftools/informatics/helpers/ohe_batching.py +52 -0
- smftools/informatics/helpers/one_hot_encode.py +10 -8
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
- smftools/informatics/helpers/split_and_index_BAM.py +16 -4
- smftools/informatics/load_adata.py +127 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +69 -13
- smftools/preprocessing/__init__.py +6 -1
- smftools/preprocessing/append_C_context.py +37 -14
- smftools/preprocessing/calculate_complexity.py +2 -2
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
- smftools/preprocessing/calculate_coverage.py +2 -2
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
- smftools/preprocessing/calculate_read_length_stats.py +56 -2
- smftools/preprocessing/clean_NaN.py +2 -2
- smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
- smftools/preprocessing/filter_reads_on_length.py +4 -2
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/preprocessing/load_sample_sheet.py +24 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/mark_duplicates.py +34 -19
- smftools/preprocessing/recipes.py +125 -0
- smftools/preprocessing/remove_duplicates.py +7 -4
- smftools/tools/apply_HMM.py +1 -0
- smftools/tools/cluster.py +0 -0
- smftools/tools/read_HMM.py +1 -0
- smftools/tools/subset_adata.py +32 -0
- smftools/tools/train_HMM.py +43 -0
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
- smftools-0.1.3.dist-info/RECORD +84 -0
- smftools/informatics/basecalls_to_adata.py +0 -42
- smftools/informatics/pod5_conversion.py +0 -53
- smftools/informatics/pod5_direct.py +0 -55
- smftools/informatics/pod5_to_adata.py +0 -40
- smftools-0.1.1.dist-info/RECORD +0 -64
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
--- a/smftools/informatics/helpers/binarize_converted_base_identities.py
+++ b/smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -5,7 +5,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_type):
     Binarizes conversion SMF data within a sequence string
 
     Parameters:
-        base_identities (dict): A dictionary returned by
+        base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
 
@@ -27,5 +27,5 @@ def binarize_converted_base_identities(base_identities, strand, modification_type):
         elif modification_type == '6mA':
             binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
         else:
-
+            print(f"{strand} not recognized")
     return binarized_base_identities
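For orientation: the binarization above maps a converted base call to 1, an unconverted call to 0, and anything else to NaN. A minimal sketch of the '6mA' branch shown in this hunk, with a made-up read dictionary:

```python
import numpy as np

# Hypothetical input: one read's base identities at adenine positions (illustrative only).
base_identities = {"read_1": ["T", "C", "A", "N"]}

# Mirrors the '6mA' branch above: 'T' -> 1, 'C' -> 0, anything else -> NaN.
binarized = {key: [1 if x == "T" else 0 if x == "C" else np.nan for x in bases]
             for key, bases in base_identities.items()}
print(binarized)  # {'read_1': [1, 0, nan, nan]}
```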
--- a/smftools/informatics/helpers/canoncall.py
+++ b/smftools/informatics/helpers/canoncall.py
@@ -19,5 +19,7 @@ def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
     import subprocess
     output = bam + bam_suffix
     command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
+    command_string = " ".join(command)
+    print(f"Running {command_string}\n to generate {output}")
     with open(output, "w") as outfile:
         subprocess.run(command, stdout=outfile)
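The two added lines only echo the assembled dorado command before running it. A hypothetical invocation (every argument value here is invented, and dorado must be on the PATH):

```python
# Placeholder values; substitute your own model, POD5 directory, and barcoding kit.
canoncall(
    model="hac",                  # assumed dorado model selector
    pod5_dir="pod5/",             # directory of POD5 files
    barcode_kit="SQK-NBD114-24",  # kit name passed to --kit-name
    bam="basecalls",              # output path prefix
    bam_suffix=".bam",
)
# Prints: Running dorado basecaller hac pod5/ --kit-name SQK-NBD114-24 -Y
#  to generate basecalls.bam
```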
--- /dev/null
+++ b/smftools/informatics/helpers/complement_base_list.py
@@ -0,0 +1,21 @@
+# complement_base_list
+
+def complement_base_list(sequence):
+    """
+    Takes a list of DNA base identities and returns their complement.
+
+    Parameters:
+        sequence (list): A list of DNA bases (e.g., ['A', 'C', 'G', 'T']).
+
+    Returns:
+        complement (list): A list of complementary DNA bases.
+    """
+    complement_mapping = {
+        'A': 'T',
+        'T': 'A',
+        'C': 'G',
+        'G': 'C',
+        'N': 'N' # Handling ambiguous bases like 'N'
+    }
+
+    return [complement_mapping[base] for base in sequence]
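Usage follows directly from the mapping table above; note that the helper complements but does not reverse, and a base outside A/T/C/G/N raises a KeyError:

```python
bases = ['A', 'C', 'G', 'T', 'N']
print(complement_base_list(bases))  # ['T', 'G', 'C', 'A', 'N']
```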
--- /dev/null
+++ b/smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -0,0 +1,54 @@
+# concatenate_fastqs_to_bam
+
+def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
+    """
+    Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
+
+    Parameters:
+        fastq_files (list): List of paths to demultiplexed FASTQ files.
+        output_bam (str): Path to the output BAM file.
+        barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
+        gzip_suffix (str): Suffix to use for input gzip files (Defaul: '.gz')
+
+    Returns:
+        None
+    """
+    import os
+    import pysam
+    import gzip
+    from Bio import SeqIO
+    from tqdm import tqdm
+
+    n_fastqs = len(fastq_files)
+
+    with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
+        for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
+            # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
+            base_name = os.path.basename(fastq_file)
+            if n_fastqs > 1:
+                if base_name.endswith('.fastq.gz'):
+                    barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
+                elif base_name.endswith('.fq.gz'):
+                    barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
+                elif base_name.endswith('.fastq'):
+                    barcode = base_name.split('_')[-1].replace('.fastq', '')
+                elif base_name.endswith('.fq'):
+                    barcode = base_name.split('_')[-1].replace('.fq', '')
+                else:
+                    raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
+
+            # Read the FASTQ file (handle gzipped and non-gzipped files)
+            open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
+            with open_func(fastq_file, 'rt') as fq_in:
+                for record in SeqIO.parse(fq_in, 'fastq'):
+                    # Create an unaligned BAM entry for each FASTQ record
+                    aln = pysam.AlignedSegment()
+                    aln.query_name = record.id
+                    aln.query_sequence = str(record.seq)
+                    aln.flag = 4 # Unmapped
+                    aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
+                    if n_fastqs > 1:
+                        # Add the barcode to the BC tag
+                        aln.set_tag(barcode_tag, barcode)
+                    # Write to BAM file
+                    bam_out.write(aln)
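The barcode is taken from the last '_'-delimited token of each filename, so a file named (hypothetically) run1_barcode01.fastq.gz tags its reads with BC:Z:barcode01. A sketch of a call with invented paths:

```python
# Invented demultiplexed inputs; the token after the final '_' becomes the BC tag.
fastqs = ["run1_barcode01.fastq.gz", "run1_barcode02.fastq.gz"]
concatenate_fastqs_to_bam(fastqs, "unaligned_reads.bam")
```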
--- a/smftools/informatics/helpers/converted_BAM_to_adata.py
+++ b/smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -21,144 +21,213 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     from .find_conversion_sites import find_conversion_sites
     from .count_aligned_reads import count_aligned_reads
     from .extract_base_identities import extract_base_identities
-    from .
+    from .make_dirs import make_dirs
+    from .ohe_batching import ohe_batching
     import pandas as pd
     import numpy as np
     import anndata as ad
     import os
-
+    from tqdm import tqdm
+    import gc
+
+    ##########################################################################################
+    ## Get file paths and make necessary directories. ##
     # Get all of the input BAM files
     files = os.listdir(split_dir)
-    #
-    os.
+    # Make output dir
+    parent_dir = os.path.dirname(split_dir)
+    h5_dir = os.path.join(parent_dir, 'h5ads')
+    tmp_dir = os.path.join(parent_dir, 'tmp')
+    make_dirs([h5_dir, tmp_dir])
     # Filter file names that contain the search string in their filename and keep them in a list
     bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
     # Sort file list by names and print the list of file names
     bams.sort()
+    bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
     print(f'Found the following BAMS: {bams}')
     final_adata = None
+    ##########################################################################################
+
+    ##########################################################################################
 
-
+    ## need to fix this section
+    # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) sequence string unconverted , 5) Complement sequence unconverted
     modification_dict = {}
+    # Init a dict to be keyed by FASTA record that points to the sequence string of the unconverted record
+    record_FASTA_dict = {}
     # While populating the dictionary, also extract the longest sequence record in the input references
     max_reference_length = 0
     for conversion_type in conversion_types:
+        # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string unconverted , 5) Complement sequence unconverted
        modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
+        # Get the max reference length
        for record in modification_dict[conversion_type].keys():
            if modification_dict[conversion_type][record][0] > max_reference_length:
                max_reference_length = modification_dict[conversion_type][record][0]
 
-
-    record_FASTA_dict = {}
+            mod_type, strand = record.split('_')[-2:]
 
-
-
-
-
+            chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
+            unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
+            current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
+            delta_max_length = max_reference_length - current_reference_length
+            sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
+            complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
+            record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
+    ##########################################################################################
+
+    ##########################################################################################
+    bam_alignment_stats_dict = {}
+    records_to_analyze = []
+    for bam_index, bam in enumerate(bam_path_list):
+        bam_alignment_stats_dict[bam_index] = {}
         # look at aligned read proportions in the bam
         aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
         percent_aligned = aligned_reads_count*100 / (aligned_reads_count+unaligned_reads_count)
-        print(f'{percent_aligned} percent of total reads in {
-
+        print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
+        bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
         # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
         for record in record_counts:
             print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
             if record_counts[record][1] >= mapping_threshold:
                 records_to_analyze.append(record)
-
-
+            bam_alignment_stats_dict[bam_index]
+            bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
+    records_to_analyze = set(records_to_analyze)
+    ##########################################################################################
+
+    ##########################################################################################
+    # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
+    # Save the file paths in the bam_record_ohe_files dict.
+    bam_record_ohe_files = {}
+
+    # Iterate over split bams
+    for bam_index, bam in enumerate(bam_path_list):
+        # Iterate over references to process
         for record in records_to_analyze:
-
-
-
-
-
-
-
-
-
-            current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
-            delta_max_length = max_reference_length - current_reference_length
-            sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
-            record_FASTA_dict[f'{record}'] = sequence
-            print(f'Chromosome: {chromosome}\nUnconverted Sequence: {sequence}')
+            unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+            sample = bams[bam_index].split(sep=bam_suffix)[0]
+            chromosome = record_FASTA_dict[unconverted_record_name][2]
+            current_reference_length = record_FASTA_dict[unconverted_record_name][4]
+            mod_type = record_FASTA_dict[unconverted_record_name][6]
+            strand = record_FASTA_dict[unconverted_record_name][7]
+
+            # Extract the base identities of reads aligned to the record
+            fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
 
-            # Get a dictionary of positional identities keyed by read id
-            print(f'Extracting base identities of target positions')
-            target_base_identities = extract_base_identities(bam, record, positions, max_reference_length)
             # binarize the dictionary of positional identities
-            print(f'Binarizing base identities
-
+            print(f'Binarizing base identities')
+            fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type)
+            rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type)
+            merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
             # converts the base identity dictionary to a dataframe.
-            binarized_base_identities_df = pd.DataFrame.from_dict(
+            binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
             sorted_index = sorted(binarized_base_identities_df.index)
             binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
-            # Get the sequence string of every read
-            print(f'Extracting base identities of all positions in each read')
-            all_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
-            # One hot encode the sequence string of the reads
-            print(f'One hot encoding base identities of all positions in each read')
-            one_hot_reads = {read_name: one_hot_encode(seq) for read_name, seq in all_base_identities.items()}
-
-            # Initialize empty DataFrames for each base
-            read_names = list(one_hot_reads.keys())
-            sequence_length = one_hot_reads[read_names[0]].shape[0]
-            df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-            df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-            df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-            df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-            df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-
-            # Iterate through the dictionary and populate the DataFrames
-            for read_name, one_hot_array in one_hot_reads.items():
-                df_A.loc[read_name] = one_hot_array[:, 0]
-                df_C.loc[read_name] = one_hot_array[:, 1]
-                df_G.loc[read_name] = one_hot_array[:, 2]
-                df_T.loc[read_name] = one_hot_array[:, 3]
-                df_N.loc[read_name] = one_hot_array[:, 4]
-
-            ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
 
             # Load an anndata object with the sample data
             X = binarized_base_identities_df.values
             adata = ad.AnnData(X, dtype=X.dtype)
-            adata.
-
-
-
-
-
-
-
-
-
-
-
+            if adata.shape[0] > 0:
+                adata.obs_names = binarized_base_identities_df.index.astype(str)
+                adata.var_names = binarized_base_identities_df.columns.astype(str)
+                adata.obs['Sample'] = [sample] * len(adata)
+                adata.obs['Strand'] = [strand] * len(adata)
+                adata.obs['Dataset'] = [mod_type] * len(adata)
+                adata.obs['Reference'] = [record] * len(adata)
+                adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
+
+                read_mapping_direction = []
+                for read_id in adata.obs_names:
+                    if read_id in fwd_base_identities.keys():
+                        read_mapping_direction.append('fwd')
+                    elif read_id in rev_base_identities.keys():
+                        read_mapping_direction.append('rev')
+                    else:
+                        read_mapping_direction.append('unk')
+
+                adata.obs['Read_mapping_direction'] = read_mapping_direction
+
+                # One hot encode the sequence string of the reads
+                fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd",batch_size=100000)
+                rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev",batch_size=100000)
+                bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
+                del fwd_base_identities, rev_base_identities
+
+                one_hot_reads = {}
+                n_rows_OHE = 5
+                for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
+                    tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
+                    one_hot_reads.update(tmp_ohe_dict)
+                    del tmp_ohe_dict
+
+                read_names = list(one_hot_reads.keys())
+                dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+
+                sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+                df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+                df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+
+                for read_name, one_hot_array in one_hot_reads.items():
+                    one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
+                    dict_A[read_name] = one_hot_array[0, :]
+                    dict_C[read_name] = one_hot_array[1, :]
+                    dict_G[read_name] = one_hot_array[2, :]
+                    dict_T[read_name] = one_hot_array[3, :]
+                    dict_N[read_name] = one_hot_array[4, :]
+
+                del one_hot_reads
+                gc.collect()
+
+                for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
+                    df_A.iloc[j] = dict_A[read_name]
+                    df_C.iloc[j] = dict_C[read_name]
+                    df_G.iloc[j] = dict_G[read_name]
+                    df_T.iloc[j] = dict_T[read_name]
+                    df_N.iloc[j] = dict_N[read_name]
+
+                del dict_A, dict_C, dict_G, dict_T, dict_N
+                gc.collect()
+
+                ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+
+                for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                    adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
+                    ohe_df_map[j] = None # Reassign pointer for memory usage purposes
+
+            if final_adata:
+                if adata.shape[0] > 0:
+                    final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
+                else:
+                    print(f"{sample} did not have any mapped reads on {record}, omiting from final adata")
+            else:
+                if adata.shape[0] > 0:
+                    final_adata = adata
+                else:
+                    print(f"{sample} did not have any mapped reads on {record}, omiting from final adata")
 
-            if final_adata:
-                final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
             else:
-
-
-
-
-
+                print(f"{sample} did not have any mapped reads on {record}, omiting from final adata")
+
+    # Set obs columns to type 'category'
+    for col in final_adata.obs.columns:
+        final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+    for record in records_to_analyze:
+        unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+        sequence = record_FASTA_dict[unconverted_record_name][0]
+        complement = record_FASTA_dict[unconverted_record_name][1]
+        chromosome = record_FASTA_dict[unconverted_record_name][2]
+        final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
+        final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
         final_adata.uns[f'{record}_FASTA_sequence'] = sequence
-        final_adata.var[f'{record}_FASTA_sequence'] = list(sequence)
-
-        # May need to remove the bottom for conversion SMF
-        record_subset = final_adata[final_adata.obs['Reference'] == record].copy()
-        layer_map, layer_counts = {}, []
-        for i, layer in enumerate(record_subset.layers):
-            layer_map[i] = layer.split('_')[0]
-            layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
-        count_array = np.array(layer_counts)
-        nucleotide_indexes = np.argmax(count_array, axis=0)
-        consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
-        final_adata.var[f'{record}_consensus_across_samples'] = consensus_sequence_list
 
     ######################################################################################################
 
     ######################################################################################################
     ## Export the final adata object
-
+    final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
+    final_adata.write_h5ad(final_output, compression='gzip')
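The new OHE path stores each read's one-hot matrix flattened in the temporary .h5ad's .uns, which is why the code above reshapes with reshape(n_rows_OHE, -1). A small numpy sketch of that round trip, using an invented 4-base read:

```python
import numpy as np

n_rows_OHE = 5  # row order A, C, G, T, N, matching the layers written above
# Hypothetical one-hot matrix for the read sequence 'ACGN' (columns are positions).
one_hot = np.array([
    [1, 0, 0, 0],  # A
    [0, 1, 0, 0],  # C
    [0, 0, 1, 0],  # G
    [0, 0, 0, 0],  # T
    [0, 0, 0, 1],  # N
])
flat = one_hot.flatten()                 # how a read is stored in the tmp file
restored = flat.reshape(n_rows_OHE, -1)  # how converted_BAM_to_adata unpacks it
assert (restored == one_hot).all()
```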
--- a/smftools/informatics/helpers/count_aligned_reads.py
+++ b/smftools/informatics/helpers/count_aligned_reads.py
@@ -16,24 +16,28 @@ def count_aligned_reads(bam_file):
     """
     from .. import readwrite
     import pysam
+    from tqdm import tqdm
+    from collections import defaultdict
+
     print('{0}: Counting aligned reads in BAM > {1}'.format(readwrite.time_string(), bam_file))
     aligned_reads_count = 0
     unaligned_reads_count = 0
     # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
-    record_counts =
+    record_counts = defaultdict(int)
+
     with pysam.AlignmentFile(bam_file, "rb") as bam:
+        total_reads = bam.mapped + bam.unmapped
         # Iterate over reads to get the total mapped read counts and the reads that map to each reference
-        for read in bam:
-            if read.is_unmapped:
+        for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
+            if read.is_unmapped:
                 unaligned_reads_count += 1
-            else:
+            else:
                 aligned_reads_count += 1
-
-
-            else:
-                record_counts[read.reference_name] = 1
+                record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
+
     # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
     for reference in record_counts:
         proportion_mapped_reads_in_record = record_counts[reference] / aligned_reads_count
         record_counts[reference] = (record_counts[reference], proportion_mapped_reads_in_record)
-
+
+    return aligned_reads_count, unaligned_reads_count, dict(record_counts)
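The switch to collections.defaultdict(int) is what lets the counting loop drop its key-existence branch: a missing key materializes as 0 on first access. A minimal illustration:

```python
from collections import defaultdict

record_counts = defaultdict(int)
for ref in ["chr1", "chr2", "chr1"]:  # made-up reference names
    record_counts[ref] += 1           # no 'if ref in record_counts' check needed
print(dict(record_counts))            # {'chr1': 2, 'chr2': 1}
```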
--- a/smftools/informatics/helpers/extract_base_identities.py
+++ b/smftools/informatics/helpers/extract_base_identities.py
@@ -3,7 +3,7 @@
 # General
 def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
     """
-    Extracts the base identities from every position within the
+    Extracts the base identities from every position within the mapped reads that have a reference coordinate
 
     Parameters:
         bam (str): File path to the BAM file to align (excluding the file suffix).
@@ -12,32 +12,46 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
     max_reference_length (int): The maximum length of a record in the reference set.
 
     Returns:
-
-
+        fwd_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from forward mapped reads. If the read does not contain that position, fill the list at that index with a N value.
+        rev_base_identities (dict): A dictionary, keyed by read name, that points to a list of base identities from reverse mapped reads. If the read does not contain that position, fill the list at that index with a N value.
     """
     from .. import readwrite
     import pysam
+    from tqdm import tqdm
+
     positions = set(positions)
     # Initialize a base identity dictionary that will hold key-value pairs that are: key (read-name) and value (list of base identities at positions of interest)
-
+    fwd_base_identities = {}
+    rev_base_identities = {}
     # Open the postion sorted BAM file
     print('{0}: Reading BAM file: {1}'.format(readwrite.time_string(), bam_file))
     with pysam.AlignmentFile(bam_file, "rb") as bam:
         # Iterate over every read in the bam that comes from the chromosome of interest
         print('{0}: Iterating over reads in bam'.format(readwrite.time_string()))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        total_reads = bam.mapped
+        for read in tqdm(bam.fetch(chromosome), desc='Extracting base identities from reads in BAM', total=total_reads):
+            # Only iterate over mapped reads
+            if read.is_mapped:
+                # Get sequence of read. PySam reports fwd mapped reads as the true read sequence. Pysam reports rev mapped reads as the reverse complement of the read.
+                query_sequence = read.query_sequence
+                # If the read aligned as a reverse complement, mark that the read is reversed
+                if read.is_reverse:
+                    # Initialize the read key in a temp base_identities dictionary by pointing to a N filled list of length reference_length.
+                    rev_base_identities[read.query_name] = ['N'] * max_reference_length
+                    # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read.query_sequence start, as well the 0-based index relative to the reference.
+                    for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
+                        # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
+                        if reference_position in positions and read_position:
+                            # get the base_identity in the read corresponding to that position
+                            rev_base_identities[read.query_name][reference_position] = query_sequence[read_position]
+                else:
+                    # Initialize the read key in a temp base_identities dictionary by pointing to a N filled list of length reference_length.
+                    fwd_base_identities[read.query_name] = ['N'] * max_reference_length
+                    # Iterate over a list of tuples for the given read. The tuples contain the 0-indexed position relative to the read.query_sequence start, as well the 0-based index relative to the reference.
+                    for read_position, reference_position in read.get_aligned_pairs(matches_only=True):
+                        # If the aligned read's reference coordinate is in the positions set and if the read position was successfully mapped
+                        if reference_position in positions and read_position:
+                            # get the base_identity in the read corresponding to that position
+                            fwd_base_identities[read.query_name][reference_position] = query_sequence[read_position]
+
+    return fwd_base_identities, rev_base_identities
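pysam's get_aligned_pairs(matches_only=True) yields (read position, reference position) pairs only where both coordinates exist, which is what lets the loop above fill a pre-initialized 'N' list. A schematic of the fill step with made-up coordinates:

```python
# Hypothetical aligned pairs: (position in read, position in reference).
aligned_pairs = [(0, 10), (1, 11), (2, 13)]  # reference position 12 skipped, e.g. a deletion
query_sequence = "ACG"
positions = {10, 11, 12, 13}  # reference coordinates of interest
max_reference_length = 15

base_identities = ['N'] * max_reference_length
for read_position, reference_position in aligned_pairs:
    if reference_position in positions and read_position:
        base_identities[reference_position] = query_sequence[read_position]

# Position 10 stays 'N' because read_position 0 is falsy, matching the
# truthiness check used in the function above; position 12 stays 'N' (unaligned).
print(base_identities[10:14])  # ['N', 'C', 'N', 'G']
```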
--- /dev/null
+++ b/smftools/informatics/helpers/extract_readnames_from_BAM.py
@@ -0,0 +1,22 @@
+# extract_readnames_from_BAM
+
+def extract_readnames_from_BAM(aligned_BAM):
+    """
+    Takes a BAM and writes out a txt file containing read names from the BAM
+
+    Parameters:
+        aligned_BAM (str): Path to an input aligned_BAM to extract read names from.
+
+    Returns:
+        None
+
+    """
+    import subprocess
+    # Make a text file of reads for the BAM
+    txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
+    samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
+    with open(txt_output, "w") as output_file:
+        cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
+        samtools_view.stdout.close()
+        cut_process.wait()
+    samtools_view.wait()
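The helper simply pipes samtools view through cut -f1 (the read name is SAM column 1). A usage sketch with an invented path; samtools must be installed:

```python
extract_readnames_from_BAM("aligned.bam")  # writes aligned_read_names.txt
# Equivalent shell pipeline: samtools view aligned.bam | cut -f1 > aligned_read_names.txt
```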
--- a/smftools/informatics/helpers/find_conversion_sites.py
+++ b/smftools/informatics/helpers/find_conversion_sites.py
@@ -6,34 +6,35 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     If searching for adenine conversions, it will find coordinates of all adenines.
 
     Parameters:
-        fasta_file (str): A string representing the file path to the
+        fasta_file (str): A string representing the file path to the converted reference FASTA.
         modification_type (str): A string representing the modification type of interest (options are '5mC' and '6mA').
         conversion_types (list): A list of strings of the conversion types to use in the analysis. Used here to pass the unconverted record name.
 
     Returns:
-        record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string
+        record_dict (dict): A dictionary keyed by unconverted record ids contained within the FASTA. Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string, 5) Complement sequence
     """
     from .. import readwrite
     from Bio import SeqIO
     from Bio.SeqRecord import SeqRecord
     from Bio.Seq import Seq
 
-    print('{0}: Finding positions of interest in reference FASTA
+    #print('{0}: Finding positions of interest in reference FASTA: {1}'.format(readwrite.time_string(), fasta_file))
     # Initialize lists to hold top and bottom strand positional coordinates of interest
     top_strand_coordinates = []
     bottom_strand_coordinates = []
     unconverted = conversion_types[0]
     record_dict = {}
-    print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
+    #print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
     # Open the FASTA record as read only
     with open(fasta_file, "r") as f:
         # Iterate over records in the FASTA
         for record in SeqIO.parse(f, "fasta"):
             # Only iterate over the unconverted records for the reference
             if unconverted in record.id:
-                print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
+                #print('{0}: Iterating over record {1} in FASTA file {2}'.format(readwrite.time_string(), record, fasta_file))
                 # Extract the sequence string of the record
                 sequence = str(record.seq).upper()
+                complement = str(record.seq.complement()).upper()
                 sequence_length = len(sequence)
                 if modification_type == '5mC':
                     # Iterate over the sequence string from the record
@@ -42,7 +43,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
                             top_strand_coordinates.append(i) # 0-indexed coordinate
                         if sequence[i] == 'G':
                             bottom_strand_coordinates.append(i) # 0-indexed coordinate
-                    print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
+                    #print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for all cytosines'.format(readwrite.time_string()))
                 elif modification_type == '6mA':
                     # Iterate over the sequence string from the record
                     for i in range(0, len(sequence)):
@@ -50,10 +51,11 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
                             top_strand_coordinates.append(i) # 0-indexed coordinate
                         if sequence[i] == 'T':
                             bottom_strand_coordinates.append(i) # 0-indexed coordinate
-                    print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
+                    #print('{0}: Returning zero-indexed top and bottom strand FASTA coordinates for adenines of interest'.format(readwrite.time_string()))
                 else:
-                    print('modification_type not found. Please try 5mC or 6mA')
-
+                    #print('modification_type not found. Please try 5mC or 6mA')
+                    pass
+                record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
             else:
                 pass
     return record_dict
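Concretely, for 5mC the scan records every C of the unconverted record as a top-strand site and every G as a bottom-strand site ('6mA' does the same with A and T). A toy version of that loop:

```python
sequence = "ACGGT"  # made-up unconverted reference sequence
top_strand_coordinates = [i for i, base in enumerate(sequence) if base == 'C']     # [1]
bottom_strand_coordinates = [i for i, base in enumerate(sequence) if base == 'G']  # [2, 3]
print(top_strand_coordinates, bottom_strand_coordinates)
```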