smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +5 -1
- smftools/_version.py +1 -1
- smftools/informatics/__init__.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/conversion_smf.py +63 -10
- smftools/informatics/direct_smf.py +66 -18
- smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
- smftools/informatics/helpers/__init__.py +16 -2
- smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
- smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
- smftools/informatics/helpers/canoncall.py +12 -3
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
- smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/extract_base_identities.py +33 -46
- smftools/informatics/helpers/extract_mods.py +55 -23
- smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/find_conversion_sites.py +33 -44
- smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
- smftools/informatics/helpers/modcall.py +13 -5
- smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
- smftools/informatics/helpers/ohe_batching.py +65 -41
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +45 -9
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/split_and_index_BAM.py +3 -8
- smftools/informatics/load_adata.py +58 -3
- smftools/plotting/__init__.py +15 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +205 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/preprocessing/__init__.py +6 -7
- smftools/preprocessing/append_C_context.py +22 -9
- smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
- smftools/preprocessing/binarize_on_Youden.py +35 -32
- smftools/preprocessing/binary_layers_to_ohe.py +13 -3
- smftools/preprocessing/calculate_complexity.py +3 -2
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
- smftools/preprocessing/calculate_coverage.py +26 -25
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_position_Youden.py +18 -7
- smftools/preprocessing/calculate_read_length_stats.py +39 -46
- smftools/preprocessing/clean_NaN.py +33 -25
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
- smftools/preprocessing/filter_reads_on_length.py +14 -4
- smftools/preprocessing/flag_duplicate_reads.py +149 -0
- smftools/preprocessing/invert_adata.py +18 -11
- smftools/preprocessing/load_sample_sheet.py +30 -16
- smftools/preprocessing/recipes.py +22 -20
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +105 -13
- smftools/tools/__init__.py +49 -0
- smftools/tools/apply_hmm.py +202 -0
- smftools/tools/apply_hmm_batched.py +241 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_distances.py +18 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/call_hmm_peaks.py +105 -0
- smftools/tools/classifiers.py +787 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/data/__init__.py +2 -0
- smftools/tools/data/anndata_data_module.py +90 -0
- smftools/tools/data/preprocessing.py +6 -0
- smftools/tools/display_hmm.py +18 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/hmm_readwrite.py +16 -0
- smftools/tools/inference/__init__.py +1 -0
- smftools/tools/inference/lightning_inference.py +41 -0
- smftools/tools/models/__init__.py +9 -0
- smftools/tools/models/base.py +14 -0
- smftools/tools/models/cnn.py +34 -0
- smftools/tools/models/lightning_base.py +41 -0
- smftools/tools/models/mlp.py +17 -0
- smftools/tools/models/positional.py +17 -0
- smftools/tools/models/rnn.py +16 -0
- smftools/tools/models/sklearn_models.py +40 -0
- smftools/tools/models/transformer.py +133 -0
- smftools/tools/models/wrappers.py +20 -0
- smftools/tools/nucleosome_hmm_refinement.py +104 -0
- smftools/tools/position_stats.py +239 -0
- smftools/tools/read_stats.py +70 -0
- smftools/tools/subset_adata.py +19 -23
- smftools/tools/train_hmm.py +78 -0
- smftools/tools/training/__init__.py +1 -0
- smftools/tools/training/train_lightning_model.py +47 -0
- smftools/tools/utils/__init__.py +2 -0
- smftools/tools/utils/device.py +10 -0
- smftools/tools/utils/grl.py +14 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
- smftools-0.1.7.dist-info/RECORD +136 -0
- smftools/tools/apply_HMM.py +0 -1
- smftools/tools/read_HMM.py +0 -1
- smftools/tools/train_HMM.py +0 -43
- smftools-0.1.3.dist-info/RECORD +0 -84
- /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
- /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py
CHANGED
|
@@ -8,6 +8,7 @@ from . import preprocessing as pp
|
|
|
8
8
|
from . import tools as tl
|
|
9
9
|
from . import plotting as pl
|
|
10
10
|
from . import readwrite, datasets
|
|
11
|
+
from .readwrite import adata_to_df, safe_write_h5ad, merge_barcoded_anndatas
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
from importlib.metadata import version
|
|
@@ -16,10 +17,13 @@ package_name = "smftools"
|
|
|
16
17
|
__version__ = version(package_name)
|
|
17
18
|
|
|
18
19
|
__all__ = [
|
|
20
|
+
"adata_to_df",
|
|
19
21
|
"inform",
|
|
20
22
|
"pp",
|
|
21
23
|
"tl",
|
|
22
24
|
"pl",
|
|
23
25
|
"readwrite",
|
|
24
|
-
"datasets"
|
|
26
|
+
"datasets",
|
|
27
|
+
"safe_write_h5ad",
|
|
28
|
+
"merge_barcoded_anndatas"
|
|
25
29
|
]
|
smftools/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.1.3"
|
|
1
|
+
__version__ = "0.1.7"
|
smftools/informatics/__init__.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from . import helpers
|
|
2
|
+
from .basecall_pod5s import basecall_pod5s
|
|
2
3
|
from .load_adata import load_adata
|
|
3
4
|
from .subsample_fasta_from_bed import subsample_fasta_from_bed
|
|
4
5
|
from .subsample_pod5 import subsample_pod5
|
|
@@ -6,6 +7,7 @@ from .fast5_to_pod5 import fast5_to_pod5
|
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
__all__ = [
|
|
10
|
+
"basecall_pod5s",
|
|
9
11
|
"load_adata",
|
|
10
12
|
"subsample_fasta_from_bed",
|
|
11
13
|
"subsample_pod5",
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
def extract_reads(bam_file_path, num_reads=10):
    """
    Print the query sequences of the first reads in a BAM file.

    Parameters:
        bam_file_path (str): File path to the BAM file to read.
        num_reads (int): Maximum number of reads to print. Zero or a negative value prints nothing.

    Returns:
        None
    """
    from itertools import islice

    # The context manager closes the BAM handle even if iteration or printing raises.
    with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
        # islice stops before consuming a read once the limit is reached, so num_reads=0
        # prints nothing (checking the count after printing always emitted one read).
        for read_number, read in enumerate(islice(bam_file, max(num_reads, 0)), start=1):
            print(f"Read {read_number}: {read.query_sequence}")
|
|
18
|
+
|
|
19
|
+
if __name__ == "__main__":
    # The BAM file path is the only required command line argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python extract_reads.py <path_to_bam_file>")
        sys.exit(1)

    # Print the first reads (default count) from the BAM given as the first argument.
    extract_reads(cli_args[0])
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# basecall_pod5s
|
|
2
|
+
|
|
3
|
+
def basecall_pod5s(config_path):
    """
    Basecall from pod5s given a config file.

    Loads the experiment config, creates the output directory and makes it the working
    directory, converts FAST5 input into a single POD5 file when no POD5 input is present,
    and then runs modified basecalling (when 'mod_list' is set) or canonical basecalling.

    Parameters:
        config_path (str): File path to the basecall configuration file

    Returns:
        None

    Raises:
        ValueError: If 'input_data_path', 'output_directory' or 'model' is missing from the config.
        FileNotFoundError: If 'input_data_path' is neither an existing file nor an existing directory.
    """
    # Lazy importing of packages
    from .helpers import LoadExperimentConfig, make_dirs, canoncall, modcall
    from .fast5_to_pod5 import fast5_to_pod5
    import os
    from pathlib import Path

    # Default params
    bam_suffix = '.bam' # If different, change from here.
    pod5_suffixes = ('.pod5', '.p5')
    fast5_suffixes = ('.fast5', '.f5')

    def has_suffix(file_name, suffixes):
        # Look at every dotted suffix so names such as 'run.v2.pod5' are still recognised.
        return any(suffix in suffixes for suffix in Path(file_name.lower()).suffixes)

    # Load experiment config parameters
    experiment_config = LoadExperimentConfig(config_path)
    var_dict = experiment_config.var_dict

    # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
    default_value = None

    # General config variable init
    input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
    output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
    # NOTE(review): conversion_smf and direct_smf call canoncall/modcall with model_dir as the
    # first argument, so it is read here and passed the same way - confirm the config key name.
    model_dir = var_dict.get('model_dir', default_value) # needed for dorado basecaller
    model = var_dict.get('model', default_value) # needed for dorado basecaller
    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
    barcode_both_ends = var_dict.get('barcode_both_ends', default_value) # dorado demultiplexing
    trim = var_dict.get('trim', default_value) # dorado adapter and barcode removal
    device = var_dict.get('device', 'auto')

    # Modified basecalling specific variable init
    mod_list = var_dict.get('mod_list', default_value)

    # Fail early with a clear message instead of an opaque TypeError further down.
    required = {'input_data_path': input_data_path, 'output_directory': output_directory, 'model': model}
    missing = [key for key, value in required.items() if not value]
    if missing:
        raise ValueError(f"Missing required config value(s): {', '.join(missing)}")

    # Resolve paths before changing the working directory so relative paths keep pointing
    # at the same locations after os.chdir.
    input_data_path = os.path.abspath(input_data_path)
    output_directory = os.path.abspath(output_directory)

    # Make initial output directory
    make_dirs([output_directory])
    os.chdir(output_directory)

    # Get the input filetype
    if Path(input_data_path).is_file():
        input_name = os.path.basename(input_data_path)
        input_is_pod5 = has_suffix(input_name, pod5_suffixes)
        input_is_fast5 = has_suffix(input_name, fast5_suffixes)
    elif Path(input_data_path).is_dir():
        # Get the file names in the input data dir
        input_files = os.listdir(input_data_path)
        input_is_pod5 = any(has_suffix(file, pod5_suffixes) for file in input_files)
        input_is_fast5 = any(has_suffix(file, fast5_suffixes) for file in input_files)
    else:
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
    if input_is_fast5 and not input_is_pod5:
        # take the input directory of fast5 files and write out a single pod5 file into the output directory.
        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
        fast5_to_pod5(input_data_path, output_pod5)
        # Reassign the input path to point to the new pod5 file.
        input_data_path = output_pod5

    # Dots in the model name are replaced so they are not mistaken for file extensions.
    model_basename = os.path.basename(model).replace('.', '_')

    if mod_list:
        mod_string = "_".join(mod_list)
        bam=f"{output_directory}/{model_basename}_{mod_string}_calls"
        modcall(model_dir, model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        bam=f"{output_directory}/{model_basename}_canonical_basecalls"
        canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
## conversion_smf
|
|
2
2
|
|
|
3
|
-
def conversion_smf(fasta, output_directory, conversion_types, strands, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall):
|
|
3
|
+
def conversion_smf(fasta, output_directory, conversion_types, strands, model_dir, model, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed):
|
|
4
4
|
"""
|
|
5
5
|
Processes sequencing data from a conversion SMF experiment to an adata object.
|
|
6
6
|
|
|
@@ -9,7 +9,8 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
|
|
|
9
9
|
output_directory (str): A file path to the directory to output all the analyses.
|
|
10
10
|
conversion_type (list): A list of strings of the conversion types to use in the analysis.
|
|
11
11
|
strands (list): A list of converstion strands to use in the experiment.
|
|
12
|
-
|
|
12
|
+
model_dir (str): a string representing the file path to the dorado basecalling model directory.
|
|
13
|
+
model (str): a string representing the dorado basecalling model.
|
|
13
14
|
input_data_path (str): a string representing the file path to the experiment directory/file containing sequencing data
|
|
14
15
|
split_dir (str): A string representing the file path to the directory to split the BAMs into.
|
|
15
16
|
barcode_kit (str): A string representing the barcoding kit used in the experiment.
|
|
@@ -17,12 +18,21 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
|
|
|
17
18
|
experiment_name (str): A string to provide an experiment name to the output adata file.
|
|
18
19
|
bam_suffix (str): A suffix to add to the bam file.
|
|
19
20
|
basecall (bool): Whether to go through basecalling or not.
|
|
21
|
+
barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
|
|
22
|
+
trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
|
|
23
|
+
device (str): Device to use for basecalling. auto, metal, cpu, cuda
|
|
24
|
+
make_bigwigs (bool): Whether to make bigwigs
|
|
25
|
+
threads (int): cpu threads available for processing.
|
|
26
|
+
input_already_demuxed (bool): Whether the input files were already demultiplexed
|
|
20
27
|
|
|
21
28
|
Returns:
|
|
22
|
-
|
|
29
|
+
final_adata_path (str): Path to the final adata object
|
|
30
|
+
sorted_output (str): Path to the aligned, sorted BAM
|
|
23
31
|
"""
|
|
24
|
-
from .helpers import align_and_sort_BAM, canoncall,
|
|
32
|
+
from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, canoncall, converted_BAM_to_adata_II, generate_converted_FASTA, get_chromosome_lengths, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc, split_and_index_BAM
|
|
25
33
|
import os
|
|
34
|
+
import glob
|
|
35
|
+
|
|
26
36
|
if basecall:
|
|
27
37
|
model_basename = os.path.basename(model)
|
|
28
38
|
model_basename = model_basename.replace('.', '_')
|
|
@@ -56,7 +66,7 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
|
|
|
56
66
|
if os.path.exists(canoncall_output):
|
|
57
67
|
print(canoncall_output + ' already exists. Using existing basecalled BAM.')
|
|
58
68
|
else:
|
|
59
|
-
canoncall(model, input_data_path, barcode_kit, bam, bam_suffix)
|
|
69
|
+
canoncall(model_dir, model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
|
|
60
70
|
else:
|
|
61
71
|
canoncall_output = input_data_path
|
|
62
72
|
|
|
@@ -66,14 +76,57 @@ def conversion_smf(fasta, output_directory, conversion_types, strands, model, in
|
|
|
66
76
|
if os.path.exists(aligned_output) and os.path.exists(sorted_output):
|
|
67
77
|
print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
|
|
68
78
|
else:
|
|
69
|
-
align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory)
|
|
79
|
+
align_and_sort_BAM(converted_FASTA, canoncall_output, bam_suffix, output_directory, make_bigwigs, threads)
|
|
80
|
+
|
|
81
|
+
# Make beds and provide basic histograms
|
|
82
|
+
bed_dir = os.path.join(output_directory, 'beds')
|
|
83
|
+
if os.path.isdir(bed_dir):
|
|
84
|
+
print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
|
|
85
|
+
else:
|
|
86
|
+
aligned_BAM_to_bed(aligned_output, output_directory, converted_FASTA, make_bigwigs, threads)
|
|
70
87
|
|
|
71
88
|
### 4) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
|
|
89
|
+
if barcode_both_ends:
|
|
90
|
+
split_dir = split_dir + '_both_ends_barcoded'
|
|
91
|
+
else:
|
|
92
|
+
split_dir = split_dir + '_at_least_one_end_barcoded'
|
|
93
|
+
|
|
72
94
|
if os.path.isdir(split_dir):
|
|
73
|
-
print(split_dir + ' already exists. Using existing
|
|
95
|
+
print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
|
|
96
|
+
bam_pattern = '*' + bam_suffix
|
|
97
|
+
bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
|
|
98
|
+
bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
|
|
99
|
+
bam_files.sort()
|
|
74
100
|
else:
|
|
75
101
|
make_dirs([split_dir])
|
|
76
|
-
|
|
102
|
+
if input_already_demuxed:
|
|
103
|
+
bam_files = split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory) # custom for non-nanopore
|
|
104
|
+
else:
|
|
105
|
+
bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
|
|
106
|
+
|
|
107
|
+
# Make beds and provide basic histograms
|
|
108
|
+
bed_dir = os.path.join(split_dir, 'beds')
|
|
109
|
+
if os.path.isdir(bed_dir):
|
|
110
|
+
print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
|
|
111
|
+
else:
|
|
112
|
+
for bam in bam_files:
|
|
113
|
+
aligned_BAM_to_bed(bam, split_dir, converted_FASTA, make_bigwigs, threads)
|
|
114
|
+
|
|
115
|
+
# 5) Samtools QC metrics on split BAM files
|
|
116
|
+
bam_qc_dir = f"{split_dir}/bam_qc"
|
|
117
|
+
if os.path.isdir(bam_qc_dir):
|
|
118
|
+
print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
|
|
119
|
+
else:
|
|
120
|
+
make_dirs([bam_qc_dir])
|
|
121
|
+
bam_qc(bam_files, bam_qc_dir, threads, modality='conversion')
|
|
122
|
+
|
|
123
|
+
# multiqc ###
|
|
124
|
+
if os.path.isdir(f"{split_dir}/multiqc"):
|
|
125
|
+
print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
|
|
126
|
+
else:
|
|
127
|
+
run_multiqc(split_dir, f"{split_dir}/multiqc")
|
|
128
|
+
|
|
129
|
+
# 6) Take the converted BAM and load it into an adata object.
|
|
130
|
+
final_adata, final_adata_path = converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device)
|
|
77
131
|
|
|
78
|
-
|
|
79
|
-
converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
|
|
132
|
+
return final_adata, final_adata_path, sorted_output, bam_files
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
## direct_smf
|
|
2
2
|
|
|
3
|
-
def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall):
|
|
3
|
+
def direct_smf(fasta, output_directory, mod_list, model_dir, model, thresholds, input_data_path, split_dir, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads):
|
|
4
4
|
"""
|
|
5
5
|
Processes sequencing data from a direct methylation detection Nanopore SMF experiment to an AnnData object.
|
|
6
6
|
|
|
@@ -8,7 +8,8 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
|
|
|
8
8
|
fasta (str): File path to the reference genome to align to.
|
|
9
9
|
output_directory (str): A file path to the directory to output all the analyses.
|
|
10
10
|
mod_list (list): A list of strings of the modification types to use in the analysis.
|
|
11
|
-
|
|
11
|
+
model_dir (str): a string representing the file path to the dorado basecalling model directory.
|
|
12
|
+
model (str): a string representing the dorado basecalling model.
|
|
12
13
|
thresholds (list): A list of floats to pass for call thresholds.
|
|
13
14
|
input_data_path (str): a string representing the file path to the experiment directory containing the input sequencing files.
|
|
14
15
|
split_dir (str): A string representing the file path to the directory to split the BAMs into.
|
|
@@ -18,11 +19,19 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
|
|
|
18
19
|
bam_suffix (str): A suffix to add to the bam file.
|
|
19
20
|
batch_size (int): An integer number of TSV files to analyze in memory at once while loading the final adata object.
|
|
20
21
|
basecall (bool): Whether to basecall
|
|
22
|
+
barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
|
|
23
|
+
trim (bool): Whether to trim barcodes, adapters, and primers from read ends
|
|
24
|
+
device (str): Device to use for basecalling. auto, metal, cpu, cuda
|
|
25
|
+
make_bigwigs (bool): Whether to make bigwigs
|
|
26
|
+
skip_unclassified (bool): Whether to skip unclassified reads when extracting mods and loading anndata
|
|
27
|
+
delete_batch_hdfs (bool): Whether to delete intermediate hdf5 files.
|
|
28
|
+
threads (int): cpu threads available for processing.
|
|
21
29
|
|
|
22
30
|
Returns:
|
|
23
|
-
|
|
31
|
+
final_adata_path (str): Path to the final adata object
|
|
32
|
+
sorted_output (str): Path to the aligned, sorted BAM
|
|
24
33
|
"""
|
|
25
|
-
from .helpers import align_and_sort_BAM, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC,
|
|
34
|
+
from .helpers import align_and_sort_BAM, aligned_BAM_to_bed, extract_mods, get_chromosome_lengths, make_modbed, modcall, modkit_extract_to_adata, modQC, demux_and_index_BAM, make_dirs, bam_qc, run_multiqc
|
|
26
35
|
import os
|
|
27
36
|
|
|
28
37
|
if basecall:
|
|
@@ -35,8 +44,15 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
|
|
|
35
44
|
bam=os.path.join(output_directory, bam_base)
|
|
36
45
|
aligned_BAM=f"{bam}_aligned"
|
|
37
46
|
aligned_sorted_BAM=f"{aligned_BAM}_sorted"
|
|
38
|
-
|
|
39
|
-
|
|
47
|
+
|
|
48
|
+
if barcode_both_ends:
|
|
49
|
+
split_dir = split_dir + '_both_ends_barcoded'
|
|
50
|
+
else:
|
|
51
|
+
split_dir = split_dir + '_at_least_one_end_barcoded'
|
|
52
|
+
|
|
53
|
+
mod_bed_dir=f"{split_dir}/split_mod_beds"
|
|
54
|
+
mod_tsv_dir=f"{split_dir}/split_mod_tsvs"
|
|
55
|
+
bam_qc_dir = f"{split_dir}/bam_qc"
|
|
40
56
|
|
|
41
57
|
aligned_sorted_output = aligned_sorted_BAM + bam_suffix
|
|
42
58
|
mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
|
|
@@ -53,7 +69,7 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
|
|
|
53
69
|
if os.path.exists(modcall_output):
|
|
54
70
|
print(modcall_output + ' already exists. Using existing basecalled BAM.')
|
|
55
71
|
else:
|
|
56
|
-
modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix)
|
|
72
|
+
modcall(model_dir, model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
|
|
57
73
|
else:
|
|
58
74
|
modcall_output = input_data_path
|
|
59
75
|
|
|
@@ -63,27 +79,59 @@ def direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_
|
|
|
63
79
|
if os.path.exists(aligned_output) and os.path.exists(sorted_output):
|
|
64
80
|
print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
|
|
65
81
|
else:
|
|
66
|
-
align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory)
|
|
82
|
+
align_and_sort_BAM(fasta, modcall_output, bam_suffix, output_directory, make_bigwigs, threads)
|
|
83
|
+
|
|
84
|
+
# Make beds and provide basic histograms
|
|
85
|
+
bed_dir = os.path.join(output_directory, 'beds')
|
|
86
|
+
if os.path.isdir(bed_dir):
|
|
87
|
+
print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + sorted_output)
|
|
88
|
+
else:
|
|
89
|
+
aligned_BAM_to_bed(aligned_output, output_directory, fasta, make_bigwigs, threads)
|
|
67
90
|
|
|
68
91
|
# 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
|
|
69
92
|
if os.path.isdir(split_dir):
|
|
70
|
-
print(split_dir + ' already exists. Using existing
|
|
93
|
+
print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
|
|
94
|
+
bam_files = os.listdir(split_dir)
|
|
95
|
+
bam_files = [os.path.join(split_dir, file) for file in bam_files if '.bam' in file and '.bai' not in file and 'unclassified' not in file]
|
|
96
|
+
bam_files.sort()
|
|
71
97
|
else:
|
|
72
98
|
make_dirs([split_dir])
|
|
73
|
-
|
|
99
|
+
bam_files = demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, fasta, make_bigwigs, threads)
|
|
100
|
+
# split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, converted_FASTA) # deprecated, just use dorado demux
|
|
101
|
+
|
|
102
|
+
# Make beds and provide basic histograms
|
|
103
|
+
bed_dir = os.path.join(split_dir, 'beds')
|
|
104
|
+
if os.path.isdir(bed_dir):
|
|
105
|
+
print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
|
|
106
|
+
else:
|
|
107
|
+
for bam in bam_files:
|
|
108
|
+
aligned_BAM_to_bed(bam, split_dir, fasta, make_bigwigs, threads)
|
|
109
|
+
|
|
110
|
+
# 4) Samtools QC metrics on split BAM files
|
|
111
|
+
if os.path.isdir(bam_qc_dir):
|
|
112
|
+
print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
|
|
113
|
+
else:
|
|
114
|
+
make_dirs([bam_qc_dir])
|
|
115
|
+
bam_qc(bam_files, bam_qc_dir, threads, modality='direct')
|
|
74
116
|
|
|
75
|
-
#
|
|
117
|
+
# 5) Using nanopore modkit to work with modified BAM files ###
|
|
76
118
|
if os.path.isdir(mod_bed_dir):
|
|
77
|
-
print(mod_bed_dir + ' already exists')
|
|
119
|
+
print(mod_bed_dir + ' already exists, skipping making modbeds')
|
|
78
120
|
else:
|
|
79
121
|
make_dirs([mod_bed_dir])
|
|
80
122
|
modQC(aligned_sorted_output, thresholds) # get QC metrics for mod calls
|
|
81
123
|
make_modbed(aligned_sorted_output, thresholds, mod_bed_dir) # Generate bed files of position methylation summaries for every sample
|
|
82
|
-
|
|
83
|
-
|
|
124
|
+
|
|
125
|
+
# multiqc ###
|
|
126
|
+
if os.path.isdir(f"{split_dir}/multiqc"):
|
|
127
|
+
print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
|
|
84
128
|
else:
|
|
85
|
-
|
|
86
|
-
|
|
129
|
+
run_multiqc(split_dir, f"{split_dir}/multiqc")
|
|
130
|
+
|
|
131
|
+
make_dirs([mod_tsv_dir])
|
|
132
|
+
extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified, threads) # Extract methylations calls for split BAM files into split TSV files
|
|
133
|
+
|
|
134
|
+
#6 Load the modification data from TSVs into an adata object
|
|
135
|
+
final_adata, final_adata_path = modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir, delete_batch_hdfs, threads)
|
|
87
136
|
|
|
88
|
-
|
|
89
|
-
modkit_extract_to_adata(fasta, split_dir, mapping_threshold, experiment_name, mods, batch_size, mod_tsv_dir)
|
|
137
|
+
return final_adata, final_adata_path, sorted_output, bam_files
|
|
@@ -42,6 +42,7 @@ class LoadExperimentConfig:
|
|
|
42
42
|
"""
|
|
43
43
|
def __init__(self, experiment_config):
|
|
44
44
|
import pandas as pd
|
|
45
|
+
print(f"Loading experiment config from {experiment_config}")
|
|
45
46
|
# Read the CSV into a pandas DataFrame
|
|
46
47
|
df = pd.read_csv(experiment_config)
|
|
47
48
|
# Initialize an empty dictionary to store variables
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
from .align_and_sort_BAM import align_and_sort_BAM
|
|
2
2
|
from .aligned_BAM_to_bed import aligned_BAM_to_bed
|
|
3
|
+
from .bam_qc import bam_qc
|
|
3
4
|
from .bed_to_bigwig import bed_to_bigwig
|
|
4
5
|
from .binarize_converted_base_identities import binarize_converted_base_identities
|
|
5
6
|
from .canoncall import canoncall
|
|
6
7
|
from .complement_base_list import complement_base_list
|
|
7
|
-
from .converted_BAM_to_adata import converted_BAM_to_adata
|
|
8
|
+
from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
|
|
8
9
|
from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
|
|
9
10
|
from .count_aligned_reads import count_aligned_reads
|
|
11
|
+
from .demux_and_index_BAM import demux_and_index_BAM
|
|
10
12
|
from .extract_base_identities import extract_base_identities
|
|
11
13
|
from .extract_mods import extract_mods
|
|
14
|
+
from .extract_read_features_from_bam import extract_read_features_from_bam
|
|
15
|
+
from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
|
|
12
16
|
from .extract_readnames_from_BAM import extract_readnames_from_BAM
|
|
13
17
|
from .find_conversion_sites import find_conversion_sites
|
|
14
18
|
from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
|
|
@@ -23,22 +27,29 @@ from .modkit_extract_to_adata import modkit_extract_to_adata
|
|
|
23
27
|
from .modQC import modQC
|
|
24
28
|
from .one_hot_encode import one_hot_encode
|
|
25
29
|
from .ohe_batching import ohe_batching
|
|
30
|
+
from .one_hot_decode import one_hot_decode
|
|
31
|
+
from .ohe_layers_decode import ohe_layers_decode
|
|
26
32
|
from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
|
|
33
|
+
from .run_multiqc import run_multiqc
|
|
27
34
|
from .separate_bam_by_bc import separate_bam_by_bc
|
|
28
35
|
from .split_and_index_BAM import split_and_index_BAM
|
|
29
36
|
|
|
30
37
|
__all__ = [
|
|
31
38
|
"align_and_sort_BAM",
|
|
32
39
|
"aligned_BAM_to_bed",
|
|
40
|
+
"bam_qc",
|
|
33
41
|
"bed_to_bigwig",
|
|
34
42
|
"binarize_converted_base_identities",
|
|
35
43
|
"canoncall",
|
|
36
44
|
"complement_base_list",
|
|
37
|
-
"
|
|
45
|
+
"converted_BAM_to_adata_II",
|
|
38
46
|
"concatenate_fastqs_to_bam",
|
|
39
47
|
"count_aligned_reads",
|
|
48
|
+
"demux_and_index_BAM",
|
|
40
49
|
"extract_base_identities",
|
|
41
50
|
"extract_mods",
|
|
51
|
+
"extract_read_features_from_bam",
|
|
52
|
+
"extract_read_lengths_from_bed",
|
|
42
53
|
"extract_readnames_from_BAM",
|
|
43
54
|
"find_conversion_sites",
|
|
44
55
|
"convert_FASTA_record",
|
|
@@ -54,7 +65,10 @@ __all__ = [
|
|
|
54
65
|
"modQC",
|
|
55
66
|
"one_hot_encode",
|
|
56
67
|
"ohe_batching",
|
|
68
|
+
"one_hot_decode",
|
|
69
|
+
"ohe_layers_decode",
|
|
57
70
|
"plot_read_length_and_coverage_histograms",
|
|
71
|
+
"run_multiqc",
|
|
58
72
|
"separate_bam_by_bc",
|
|
59
73
|
"split_and_index_BAM"
|
|
60
74
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
## align_and_sort_BAM
|
|
2
2
|
|
|
3
|
-
def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
|
|
3
|
+
def align_and_sort_BAM(fasta, input, bam_suffix='.bam', output_directory='aligned_outputs', make_bigwigs=False, threads=None):
|
|
4
4
|
"""
|
|
5
5
|
A wrapper for running dorado aligner and samtools functions
|
|
6
6
|
|
|
@@ -9,6 +9,8 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
|
|
|
9
9
|
input (str): File path to the basecalled file to align. Works for .bam and .fastq files
|
|
10
10
|
bam_suffix (str): The suffix to use for the BAM file.
|
|
11
11
|
output_directory (str): A file path to the directory to output all the analyses.
|
|
12
|
+
make_bigwigs (bool): Whether to make bigwigs
|
|
13
|
+
threads (int): Number of additional threads to use
|
|
12
14
|
|
|
13
15
|
Returns:
|
|
14
16
|
None
|
|
@@ -16,9 +18,7 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
|
|
|
16
18
|
"""
|
|
17
19
|
import subprocess
|
|
18
20
|
import os
|
|
19
|
-
|
|
20
|
-
from .extract_readnames_from_BAM import extract_readnames_from_BAM
|
|
21
|
-
from .make_dirs import make_dirs
|
|
21
|
+
|
|
22
22
|
input_basename = os.path.basename(input)
|
|
23
23
|
input_suffix = '.' + input_basename.split('.')[1]
|
|
24
24
|
|
|
@@ -28,21 +28,32 @@ def align_and_sort_BAM(fasta, input, bam_suffix, output_directory):
|
|
|
28
28
|
aligned_sorted_BAM=f"{aligned_BAM}_sorted"
|
|
29
29
|
aligned_output = aligned_BAM + bam_suffix
|
|
30
30
|
aligned_sorted_output = aligned_sorted_BAM + bam_suffix
|
|
31
|
+
|
|
32
|
+
if threads:
|
|
33
|
+
threads = str(threads)
|
|
34
|
+
else:
|
|
35
|
+
pass
|
|
31
36
|
|
|
32
37
|
# Run dorado aligner
|
|
33
|
-
|
|
38
|
+
print(f"Aligning BAM to Reference: {input}")
|
|
39
|
+
if threads:
|
|
40
|
+
alignment_command = ["dorado", "aligner", "-t", threads, '--mm2-opts', "-N 1", fasta, input]
|
|
41
|
+
else:
|
|
42
|
+
alignment_command = ["dorado", "aligner", '--mm2-opts', "-N 1", fasta, input]
|
|
43
|
+
subprocess.run(alignment_command, stdout=open(aligned_output, "w"))
|
|
34
44
|
|
|
35
45
|
# Sort the BAM on positional coordinates
|
|
36
|
-
|
|
46
|
+
print(f"Sorting BAM: {aligned_output}")
|
|
47
|
+
if threads:
|
|
48
|
+
sort_command = ["samtools", "sort", "-@", threads, "-o", aligned_sorted_output, aligned_output]
|
|
49
|
+
else:
|
|
50
|
+
sort_command = ["samtools", "sort", "-o", aligned_sorted_output, aligned_output]
|
|
51
|
+
subprocess.run(sort_command)
|
|
37
52
|
|
|
38
53
|
# Create a BAM index file
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
aligned_BAM_to_bed(aligned_sorted_output, plotting_dir, bed_dir, fasta)
|
|
46
|
-
|
|
47
|
-
# Make a text file of reads for the BAM
|
|
48
|
-
extract_readnames_from_BAM(aligned_sorted_output)
|
|
54
|
+
print(f"Indexing BAM: {aligned_sorted_output}")
|
|
55
|
+
if threads:
|
|
56
|
+
index_command = ["samtools", "index", "-@", threads, aligned_sorted_output]
|
|
57
|
+
else:
|
|
58
|
+
index_command = ["samtools", "index", aligned_sorted_output]
|
|
59
|
+
subprocess.run(index_command)
|