smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_settings.py +3 -2
- smftools/_version.py +1 -1
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/datasets.py +8 -7
- smftools/informatics/__init__.py +7 -5
- smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
- smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/conversion_smf.py +79 -0
- smftools/informatics/direct_smf.py +89 -0
- smftools/informatics/fast5_to_pod5.py +8 -6
- smftools/informatics/helpers/__init__.py +18 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
- smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
- smftools/informatics/helpers/canoncall.py +2 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
- smftools/informatics/helpers/count_aligned_reads.py +13 -9
- smftools/informatics/helpers/extract_base_identities.py +34 -20
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +11 -9
- smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/modcall.py +3 -1
- smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
- smftools/informatics/helpers/ohe_batching.py +52 -0
- smftools/informatics/helpers/one_hot_encode.py +10 -8
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
- smftools/informatics/helpers/split_and_index_BAM.py +16 -4
- smftools/informatics/load_adata.py +127 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +69 -13
- smftools/preprocessing/__init__.py +6 -1
- smftools/preprocessing/append_C_context.py +37 -14
- smftools/preprocessing/calculate_complexity.py +2 -2
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
- smftools/preprocessing/calculate_coverage.py +2 -2
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
- smftools/preprocessing/calculate_read_length_stats.py +56 -2
- smftools/preprocessing/clean_NaN.py +2 -2
- smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
- smftools/preprocessing/filter_reads_on_length.py +4 -2
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/preprocessing/load_sample_sheet.py +24 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/mark_duplicates.py +34 -19
- smftools/preprocessing/recipes.py +125 -0
- smftools/preprocessing/remove_duplicates.py +7 -4
- smftools/tools/apply_HMM.py +1 -0
- smftools/tools/cluster.py +0 -0
- smftools/tools/read_HMM.py +1 -0
- smftools/tools/subset_adata.py +32 -0
- smftools/tools/train_HMM.py +43 -0
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
- smftools-0.1.3.dist-info/RECORD +84 -0
- smftools/informatics/basecalls_to_adata.py +0 -42
- smftools/informatics/pod5_conversion.py +0 -53
- smftools/informatics/pod5_direct.py +0 -55
- smftools/informatics/pod5_to_adata.py +0 -40
- smftools-0.1.1.dist-info/RECORD +0 -64
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/ohe_batching.py
@@ -0,0 +1,52 @@
+# ohe_batching
+
+def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000):
+    """
+    Processes base identities to one-hot encoded matrices and writes to a h5ad file in batches.
+
+    Parameters:
+        base_identities (dict): A dictionary of read names and sequences.
+        tmp_dir (str): Path to directory where the files will be saved.
+        record (str): Name of the record.
+        prefix (str): Prefix to add to the output file name
+        batch_size (int): Number of reads to process in each batch.
+
+    Returns:
+        ohe_file (list): list of output file names
+    """
+    import os
+    import anndata as ad
+    import numpy as np
+    from tqdm import tqdm
+    from .one_hot_encode import one_hot_encode
+
+    batch = {}
+    count = 0
+    batch_number = 0
+    total_reads = len(base_identities)
+    file_names = []
+
+    for read_name, seq in tqdm(base_identities.items(), desc="Encoding and writing one hot encoded reads", total=total_reads):
+        one_hot_matrix = one_hot_encode(seq)
+        batch[read_name] = one_hot_matrix
+        count += 1
+        # If the batch size is reached, write out the batch and reset
+        if count >= batch_size:
+            save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad.gz')
+            X = np.random.rand(1, 1)
+            tmp_ad = ad.AnnData(X=X, uns=batch)
+            tmp_ad.write_h5ad(save_name, compression='gzip')
+            file_names.append(save_name)
+            batch.clear()
+            count = 0
+            batch_number += 1
+
+    # Write out any remaining reads in the final batch
+    if batch:
+        save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad.gz')
+        X = np.random.rand(1, 1)
+        tmp_ad = ad.AnnData(X=X, uns=batch)
+        tmp_ad.write_h5ad(save_name, compression='gzip')
+        file_names.append(save_name)
+
+    return file_names
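A minimal usage sketch of the new helper (toy reads and hypothetical paths; the import path and reading the batches back with anndata are assumptions, not shown in this diff):

# Sketch only: toy inputs, hypothetical import path.
from smftools.informatics.helpers.ohe_batching import ohe_batching
import anndata as ad

reads = {"read_1": list("ACGT"), "read_2": list("GGNA")}  # read name -> list of bases
files = ohe_batching(reads, tmp_dir="/tmp", record="locus1", prefix="demo", batch_size=1)

# Each temporary .h5ad.gz stores the flattened one-hot arrays in .uns, keyed by read name.
for f in files:
    print(f, list(ad.read_h5ad(f).uns.keys()))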
smftools/informatics/helpers/one_hot_encode.py
@@ -3,17 +3,19 @@
 # String encodings
 def one_hot_encode(sequence):
     """
-    One hot encodes a sequence
+    One hot encodes a sequence list.
     Parameters:
-        sequence (
+        sequence (list): A list of DNA base sequences.

     Returns:
-
+        flattened (ndarray): A numpy ndarray holding a flattened one hot encoding of the input sequence string.
     """
     import numpy as np

-
-
-
-
-
+    seq_array = np.array(sequence, dtype='<U1') # String dtype
+    mapping = np.array(['A', 'C', 'G', 'T', 'N'])
+    seq_array[~np.isin(seq_array, mapping)] = 'N'
+    one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
+    flattened = one_hot_matrix.flatten()
+
+    return flattened
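To make the new vectorized encoding concrete, here is a standalone restatement of the same NumPy logic on a toy input (not a call into the package):

import numpy as np

sequence = list("ACGX")  # 'X' is outside the alphabet and is remapped to 'N'
mapping = np.array(['A', 'C', 'G', 'T', 'N'])
seq_array = np.array(sequence, dtype='<U1')
seq_array[~np.isin(seq_array, mapping)] = 'N'
one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
print(one_hot_matrix)
# [[1 0 0 0 0]
#  [0 1 0 0 0]
#  [0 0 1 0 0]
#  [0 0 0 0 1]]
print(one_hot_matrix.flatten())  # flattened vector of length 5 * len(sequence)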
smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py
@@ -0,0 +1,52 @@
+# plot_read_length_and_coverage_histograms
+
+def plot_read_length_and_coverage_histograms(bed_file, plotting_directory):
+    """
+    Plots read length and coverage statistics for each record.
+
+    Parameters:
+        bed_file (str): Path to the bed file to derive read lengths and coverage from.
+        plot_directory (str): Path to the directory to write out historgrams.
+
+    Returns:
+        None
+    """
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import numpy as np
+    import os
+
+    bed_basename = os.path.basename(bed_file).split('.bed')[0]
+    # Load the BED file into a DataFrame
+    df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name'])
+
+    # Group by chromosome
+    grouped = df.groupby('chromosome')
+
+    for chrom, group in grouped:
+        # Plot read length histogram
+        plt.figure(figsize=(12, 6))
+        plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
+        plt.title(f'Read Length Histogram of reads aligned to {chrom}')
+        plt.xlabel('Read Length')
+        plt.ylabel('Count')
+        plt.grid(True)
+        save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
+        plt.savefig(save_name)
+        plt.close()
+
+        # Compute coverage
+        coverage = np.zeros(group['end'].max())
+        for _, row in group.iterrows():
+            coverage[row['start']:row['end']] += 1
+
+        # Plot coverage histogram
+        plt.figure(figsize=(12, 6))
+        plt.plot(coverage, color='b')
+        plt.title(f'Coverage Histogram for {chrom}')
+        plt.xlabel('Position')
+        plt.ylabel('Coverage')
+        plt.grid(True)
+        save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
+        plt.savefig(save_name)
+        plt.close()
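The helper expects a five-column, tab-separated BED matching the `names=` list above (presumably produced upstream by aligned_BAM_to_bed). A toy sketch of building such a file and calling the function (hypothetical paths, smftools call commented out):

# Sketch only: toy 5-column BED (chromosome, start, end, length, read_name); paths are hypothetical.
import os

rows = [("chr1", 1000, 1480, 480, "read_0001"), ("chr1", 1020, 1530, 510, "read_0002")]
with open("toy.bed", "w") as fh:
    for r in rows:
        fh.write("\t".join(map(str, r)) + "\n")
os.makedirs("plots", exist_ok=True)

# from smftools.informatics.helpers.plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
# plot_read_length_and_coverage_histograms("toy.bed", "plots")  # one read-length and one coverage PNG per chromosome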
smftools/informatics/helpers/separate_bam_by_bc.py
@@ -1,7 +1,7 @@
 ## separate_bam_by_bc

 # General
-def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
+def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     """
     Separates an input BAM file on the BC SAM tag values.

@@ -9,6 +9,7 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
         input_bam (str): File path to the BAM file to split.
         output_prefix (str): A prefix to append to the output BAM.
         bam_suffix (str): A suffix to add to the bam file.
+        split_dir (str): String indicating path to directory to split BAMs into

     Returns:
         None
@@ -31,7 +32,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix):
             bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
             # Open the output BAM file corresponding to the barcode
             if bc_tag not in output_files:
-
+                output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
+                output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
             # Write the read to the corresponding output BAM file
             output_files[bc_tag].write(read)
         except KeyError:
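A brief call sketch for the updated signature (hypothetical file names; split_dir is the argument newly added in this release):

# Sketch only: hypothetical paths; split_dir is the new destination directory for per-barcode BAMs.
from smftools.informatics.helpers.separate_bam_by_bc import separate_bam_by_bc

separate_bam_by_bc(
    input_bam="aligned_sorted.bam",
    output_prefix="20240101",
    bam_suffix=".bam",
    split_dir="demultiplexed_BAMs",
)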
smftools/informatics/helpers/split_and_index_BAM.py
@@ -1,12 +1,14 @@
 ## split_and_index_BAM

-def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
+def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, fasta):
     """
     A wrapper function for splitting BAMS and indexing them.
     Parameters:
         aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         bam_suffix (str): A suffix to add to the bam file.
+        output_directory (str): A file path to the directory to output all the analyses.
+        fasta (str): File path to the reference genome to align to.

     Returns:
         None
@@ -17,13 +19,23 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
     import subprocess
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
+    from .aligned_BAM_to_bed import aligned_BAM_to_bed
+    from .extract_readnames_from_BAM import extract_readnames_from_BAM
+    from .make_dirs import make_dirs

-    os.
+    plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
+    bed_dir = os.path.join(output_directory, 'demultiplexed_read_alignment_coordinates')
+    make_dirs([plotting_dir, bed_dir])
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
     file_prefix = readwrite.date_string()
-    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix)
+    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
     bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+    bam_files = [bam for bam in bam_files if '.bai' not in bam]
     for input_file in bam_files:
-        subprocess.run(["samtools", "index", input_file])
+        subprocess.run(["samtools", "index", input_file])
+        # Make a bed file of coordinates for the BAM
+        aligned_BAM_to_bed(input_file, plotting_dir, bed_dir, fasta)
+        # Make a text file of reads for the BAM
+        extract_readnames_from_BAM(input_file)
smftools/informatics/load_adata.py
@@ -0,0 +1,127 @@
+## load_adata
+
+def load_adata(config_path):
+    """
+    High-level function to call for converting raw sequencing data to an adata object.
+    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
+    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
+    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
+
+    Parameters:
+        config_path (str): A string representing the file path to the experiment configuration csv file.
+
+    Returns:
+        None
+    """
+    # Lazy importing of packages
+    from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam
+    from .fast5_to_pod5 import fast5_to_pod5
+    from .subsample_fasta_from_bed import subsample_fasta_from_bed
+    import os
+    import numpy as np
+    from pathlib import Path
+
+    # Default params
+    bam_suffix = '.bam' # If different, change from here.
+    split_dir = 'demultiplexed_BAMs' # If different, change from here.
+    strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
+    conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
+
+    # Load experiment config parameters into global variables
+    experiment_config = LoadExperimentConfig(config_path)
+    var_dict = experiment_config.var_dict
+
+    # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
+    default_value = None
+
+    # General config variable init
+    smf_modality = var_dict.get('smf_modality', default_value) # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Necessary.
+    input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+    output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
+    fasta = var_dict.get('fasta', default_value) # Path to reference FASTA.
+    fasta_regions_of_interest = var_dict.get("fasta_regions_of_interest", default_value) # Path to a bed file listing coordinate regions of interest within the FASTA to include. Optional.
+    mapping_threshold = var_dict.get('mapping_threshold', default_value) # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
+    experiment_name = var_dict.get('experiment_name', default_value) # A key term to add to the AnnData file name.
+    model = var_dict.get('model', default_value) # needed for dorado basecaller
+    barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
+    # Conversion specific variable init
+    conversion_types = var_dict.get('conversion_types', default_value)
+    # Direct methylation specific variable init
+    filter_threshold = var_dict.get('filter_threshold', default_value)
+    m6A_threshold = var_dict.get('m6A_threshold', default_value)
+    m5C_threshold = var_dict.get('m5C_threshold', default_value)
+    hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
+    thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
+    mod_list = var_dict.get('mod_list', default_value)
+    batch_size = var_dict.get('batch_size', default_value)
+
+    # Make initial output directory
+    make_dirs([output_directory])
+    os.chdir(output_directory)
+    # Define the pathname to split BAMs into later during demultiplexing.
+    split_path = os.path.join(output_directory, split_dir)
+
+    # If fasta_regions_of_interest is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
+    if fasta_regions_of_interest and '.bed' in fasta_regions_of_interest:
+        fasta_basename = os.path.basename(fasta).split('.fa')[0]
+        bed_basename_minus_suffix = os.path.basename(fasta_regions_of_interest).split('.bed')[0]
+        output_FASTA = fasta_basename + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta'
+        subsample_fasta_from_bed(fasta, fasta_regions_of_interest, output_directory, output_FASTA)
+        fasta = os.path.join(output_directory, output_FASTA)
+
+    # If conversion_types is passed:
+    if conversion_types:
+        conversions += conversion_types
+
+    # Get the input filetype
+    if Path(input_data_path).is_file():
+        input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
+        input_is_pod5 = input_data_filetype in ['.pod5','.p5']
+        input_is_fast5 = input_data_filetype in ['.fast5','.f5']
+        input_is_fastq = input_data_filetype in ['.fastq', '.fq']
+        input_is_bam = input_data_filetype == bam_suffix
+        if input_is_fastq:
+            fastq_paths = [input_data_path]
+    elif Path(input_data_path).is_dir():
+        # Get the file names in the input data dir
+        input_files = os.listdir(input_data_path)
+        input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
+        input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
+        input_is_fastq = sum([True for file in input_files if '.fastq' in file or '.fq' in file])
+        input_is_bam = sum([True for file in input_files if bam_suffix in file])
+        if input_is_fastq:
+            fastq_paths = [os.path.join(input_data_path, file) for file in input_files if '.fastq' in file or '.fq' in file]
+
+    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
+    if input_is_fast5 and not input_is_pod5:
+        # take the input directory of fast5 files and write out a single pod5 file into the output directory.
+        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
+        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
+        fast5_to_pod5(input_data_path, output_pod5)
+        # Reassign the pod5_dir variable to point to the new pod5 file.
+        input_data_path = output_pod5
+        input_is_pod5 = True
+        input_is_fast5 = False
+
+    elif input_is_fastq:
+        output_bam = os.path.join(output_directory, 'FASTQs_concatenated_into_BAM.bam')
+        concatenate_fastqs_to_bam(fastq_paths, output_bam, barcode_tag='BC', gzip_suffix='.gz')
+        input_data_path = output_bam
+        input_is_bam = True
+        input_is_fastq = False
+
+    if input_is_pod5:
+        basecall = True
+    elif input_is_bam:
+        basecall = False
+    else:
+        print('Error, can not find input bam or pod5')
+
+    if smf_modality == 'conversion':
+        from .conversion_smf import conversion_smf
+        conversion_smf(fasta, output_directory, conversions, strands, model, input_data_path, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall)
+    elif smf_modality == 'direct':
+        from .direct_smf import direct_smf
+        direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_path, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall)
+    else:
+        print("Error")
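Since load_adata is driven entirely by the experiment configuration CSV, a rough sketch of writing one and invoking the function follows. The key names come from the var_dict.get(...) calls above; the exact CSV layout expected by LoadExperimentConfig is not shown in this diff, so the two-column key,value format here is an assumption, and all paths and model names are hypothetical:

# Sketch only: assumed key,value CSV layout; hypothetical paths and model/kit names.
import csv

config = {
    "smf_modality": "conversion",            # or "direct"
    "input_data_path": "/data/run1/pod5",    # POD5/FAST5 directory, or a BAM/FASTQ file
    "output_directory": "/analysis/run1",
    "fasta": "/refs/genome.fa",
    "mapping_threshold": 0.01,
    "experiment_name": "run1_smf",
    "model": "hac_basecalling_model",        # dorado basecalling model (hypothetical name)
    "barcode_kit": "nanopore_barcode_kit",   # hypothetical kit name
    "conversion_types": "5mC",
}

with open("experiment_config.csv", "w", newline="") as fh:
    writer = csv.writer(fh)
    for key, value in config.items():
        writer.writerow([key, value])

# from smftools.informatics import load_adata
# load_adata("experiment_config.csv")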
smftools/informatics/subsample_fasta_from_bed.py
@@ -0,0 +1,47 @@
+# subsample_fasta_from_bed
+
+def subsample_fasta_from_bed(input_FASTA, input_bed, output_directory, output_FASTA):
+    """
+    Take a genome-wide FASTA file and a bed file containing coordinate windows of interest. Outputs a subsampled FASTA.
+
+    Parameters:
+        input_FASTA (str): String representing the path to the input FASTA file.
+        input_bed (str): String representing the path to the input BED file.
+        output_directory (str): String representing the path to the output directory for the new FASTA file.
+        output_FASTA (str): Name of the output FASTA.
+
+    Returns:
+        None
+    """
+    from pyfaidx import Fasta
+    import os
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(input_FASTA)
+
+    output_FASTA_path = os.path.join(output_directory, output_FASTA)
+
+    # Open the BED file
+    with open(input_bed, 'r') as bed, open(output_FASTA_path, 'w') as out_fasta:
+        for line in bed:
+            # Each line in BED file contains: chrom, start, end (and possibly more columns)
+            fields = line.strip().split()
+            n_fields = len(fields)
+            chrom = fields[0]
+            start = int(fields[1]) # BED is 0-based
+            end = int(fields[2]) # BED is 0-based and end is exclusive
+            if n_fields > 3:
+                description = " ".join(fields[3:])
+
+            # Check if the chromosome exists in the FASTA file
+            if chrom in fasta:
+                # pyfaidx is 1-based, so convert coordinates accordingly
+                sequence = fasta[chrom][start:end].seq
+                # Write the sequence to the output FASTA file
+                if n_fields > 3:
+                    out_fasta.write(f">{chrom}:{start}-{end} {description}\n")
+                else:
+                    out_fasta.write(f">{chrom}:{start}-{end}\n")
+                out_fasta.write(f"{sequence}\n")
+            else:
+                print(f"Warning: {chrom} not found in the FASTA file")
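A toy end-to-end sketch illustrating the ">chrom:start-end description" headers produced above (assumes pyfaidx is installed; file contents are invented and the smftools call is commented out because the paths are hypothetical):

# Sketch only: toy reference and BED window.
with open("toy.fa", "w") as fh:
    fh.write(">chr1\n" + "ACGT" * 50 + "\n")      # 200 bp toy chromosome
with open("regions.bed", "w") as fh:
    fh.write("chr1\t10\t30\tpromoterA\n")          # 0-based, end-exclusive window

# from smftools.informatics.subsample_fasta_from_bed import subsample_fasta_from_bed
# subsample_fasta_from_bed("toy.fa", "regions.bed", ".", "toy_subset.fasta")
# -> toy_subset.fasta would contain ">chr1:10-30 promoterA" followed by the 20 bp slice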
smftools/informatics/subsample_pod5.py
@@ -6,7 +6,7 @@ def subsample_pod5(pod5_path, read_name_path, output_directory):
     This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.

     Parameters:
-        pod5_path (str): File path to the POD5 to subsample.
+        pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
         read_name_path (str | int): File path to a text file of read names. One read name per line. If an int value is passed, a random subset of that many reads will occur
         output_directory (str): A file path to the directory to output the file.

@@ -16,30 +16,86 @@ def subsample_pod5(pod5_path, read_name_path, output_directory):
     import pod5 as p5
     import os

-
+    if os.path.isdir(pod5_path):
+        pod5_path_is_dir = True
+        input_pod5_base = 'input_pod5s.pod5'
+        files = os.listdir(pod5_path)
+        pod5_files = [os.path.join(pod5_path, file) for file in files if '.pod5' in file]
+        pod5_files.sort()
+        print(f'Found input pod5s: {pod5_files}')
+
+    elif os.path.exists(pod5_path):
+        pod5_path_is_dir = False
+        input_pod5_base = os.path.basename(pod5_path)
+
+    else:
+        print('Error: pod5_path passed does not exist')
+        return None

     if type(read_name_path) == str:
         input_read_name_base = os.path.basename(read_name_path)
         output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
+
         # extract read names into a list of strings
         with open(read_name_path, 'r') as file:
             read_names = [line.strip() for line in file]
-
-
-
-
+
+        print(f'Looking for read_ids: {read_names}')
+        read_records = []
+
+        if pod5_path_is_dir:
+            for input_pod5 in pod5_files:
+                with p5.Reader(input_pod5) as reader:
+                    try:
+                        for read_record in reader.reads(selection=read_names, missing_ok=True):
+                            read_records.append(read_record.to_read())
+                            print(f'Found read in {input_pod5}: {read_record.read_id}')
+                    except:
+                        print('Skipping pod5, could not find reads')
+        else:
+            with p5.Reader(pod5_path) as reader:
+                try:
+                    for read_record in reader.reads(selection=read_names):
+                        read_records.append(read_record.to_read())
+                        print(f'Found read in {input_pod5}: {read_record}')
+                except:
+                    print('Could not find reads')

     elif type(read_name_path) == int:
         import random
         output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
-
-
-
-
-
-
+        all_read_records = []
+
+        if pod5_path_is_dir:
+            # Shuffle the list of input pod5 paths
+            random.shuffle(pod5_files)
+            for input_pod5 in pod5_files:
+                # iterate over the input pod5s
+                print(f'Opening pod5 file {input_pod5}')
+                with p5.Reader(pod5_path) as reader:
+                    for read_record in reader.reads():
+                        all_read_records.append(read_record.to_read())
+                        # When enough reads are in all_read_records, stop accumulating reads.
+                        if len(all_read_records) >= read_name_path:
+                            break
+
+            if read_name_path <= len(all_read_records):
+                read_records = random.sample(all_read_records, read_name_path)
+            else:
+                print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                read_records = all_read_records
+
         else:
-
+            with p5.Reader(pod5_path) as reader:
+                for read_record in reader.reads():
+                    # get all read records from the input pod5
+                    all_read_records.append(read_record.to_read())
+            if read_name_path <= len(all_read_records):
+                # if the subsampling amount is less than the record amount in the file, randomly subsample the reads
+                read_records = random.sample(all_read_records, read_name_path)
+            else:
+                print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                read_records = all_read_records

     output_pod5 = os.path.join(output_directory, output_base)

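A short call sketch covering the two input modes handled above (hypothetical paths):

# Sketch only: hypothetical paths.
from smftools.informatics.subsample_pod5 import subsample_pod5

# Mode 1: keep the reads listed in a text file (one read_id per line), searching a directory of pod5s
subsample_pod5("/data/run1/pod5", "/analysis/run1/region_read_names.txt", "/analysis/run1")

# Mode 2: randomly keep 500 reads from a single pod5 file
subsample_pod5("/data/run1/run1.pod5", 500, "/analysis/run1")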
smftools/preprocessing/__init__.py
@@ -9,8 +9,10 @@ from .clean_NaN import clean_NaN
 from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
 from .filter_reads_on_length import filter_reads_on_length
 from .invert_adata import invert_adata
+from .load_sample_sheet import load_sample_sheet
 from .mark_duplicates import mark_duplicates
 from .remove_duplicates import remove_duplicates
+from .recipes import recipe_1_Kissiov_and_McKenna_2025, recipe_2_Kissiov_and_McKenna_2025

 __all__ = [
     "append_C_context",
@@ -24,6 +26,9 @@ __all__ = [
     "filter_converted_reads_on_methylation",
     "filter_reads_on_length",
     "invert_adata",
+    "load_sample_sheet",
     "mark_duplicates",
-    "remove_duplicates"
+    "remove_duplicates",
+    "recipe_1_Kissiov_and_McKenna_2025",
+    "recipe_2_Kissiov_and_McKenna_2025"
 ]
smftools/preprocessing/append_C_context.py
@@ -17,29 +17,52 @@ def append_C_context(adata, obs_column='Reference', use_consensus=False):
     """
     import numpy as np
    import anndata as ad
-    site_types = ['GpC_site', 'CpG_site', '
+    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
     categories = adata.obs[obs_column].cat.categories
     for cat in categories:
+        # Assess if the strand is the top or bottom strand converted
+        if 'top' in cat:
+            strand = 'top'
+        elif 'bottom' in cat:
+            strand = 'bottom'
+
         if use_consensus:
             sequence = adata.uns[f'{cat}_consensus_sequence']
         else:
+            # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
             sequence = adata.uns[f'{cat}_FASTA_sequence']
+        # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
         boolean_dict = {}
         for site_type in site_types:
             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        if strand == 'top':
+            # Iterate through the sequence and apply the criteria
+            for i in range(1, len(sequence) - 1):
+                if sequence[i] == 'C':
+                    if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
+                        boolean_dict[f'{cat}_GpC_site'][i] = True
+                    elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
+                        boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                    elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
+                        boolean_dict[f'{cat}_CpG_site'][i] = True
+                    elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
+                        boolean_dict[f'{cat}_other_C'][i] = True
+        elif strand == 'bottom':
+            # Iterate through the sequence and apply the criteria
+            for i in range(1, len(sequence) - 1):
+                if sequence[i] == 'G':
+                    if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
+                        boolean_dict[f'{cat}_GpC_site'][i] = True
+                    elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
+                        boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                    elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
+                        boolean_dict[f'{cat}_CpG_site'][i] = True
+                    elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
+                        boolean_dict[f'{cat}_other_C'][i] = True
+        else:
+            print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
+
         for site_type in site_types:
             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].copy().X
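To make the top-strand classification above concrete, here is a standalone restatement of the same rules on a toy sequence (not a call into the package; the bottom-strand branch mirrors it on G positions):

# Standalone illustration of the top-strand site classification shown above.
sequence = "ATCGAGCTGCGACAT"
labels = {}
for i in range(1, len(sequence) - 1):
    if sequence[i] == 'C':
        prev_G, next_G = sequence[i - 1] == 'G', sequence[i + 1] == 'G'
        if prev_G and not next_G:
            labels[i] = 'GpC_site'
        elif prev_G and next_G:
            labels[i] = 'ambiguous_GpC_CpG_site'   # the C sits in both a GpC and a CpG context
        elif not prev_G and next_G:
            labels[i] = 'CpG_site'
        else:
            labels[i] = 'other_C'
print(labels)
# {2: 'CpG_site', 6: 'GpC_site', 9: 'ambiguous_GpC_CpG_site', 12: 'other_C'}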
smftools/preprocessing/calculate_complexity.py
@@ -1,16 +1,16 @@
 ## calculate_complexity

-def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False
+def calculate_complexity(adata, output_directory='', obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
     """
     A complexity analysis of the library.

     Parameters:
         adata (AnnData): An adata object with mark_duplicates already run.
+        output_directory (str): String representing the path to the output directory.
         obs_column (str): String of the obs column to iterate over.
         sample_col (str): String of the sample column to iterate over.
         plot (bool): Whether to plot the complexity model.
         save_plot (bool): Whether to save the complexity model.
-        output_directory (str): String representing the path to the output directory.

     Returns:
         None
smftools/preprocessing/calculate_consensus.py
@@ -0,0 +1,47 @@
+# calculate_consensus
+
+def calculate_consensus(adata, reference, sample=False, reference_column='Reference', sample_column='Sample'):
+    """
+    Takes an input AnnData object, the reference to subset on, and the sample name to subset on to calculate the consensus sequence of the read set.
+
+    Parameters:
+        adata (AnnData): The input adata to append consensus metadata to.
+        reference (str): The name of the reference to subset the adata on.
+        sample (bool | str): If False, uses all samples. If a string is passed, the adata is further subsetted to only analyze that sample.
+        reference_column (str): The name of the reference column (Default is 'Reference')
+        sample_column (str): The name of the sample column (Default is 'Sample)
+
+    Returns:
+        None
+
+    """
+    import numpy as np
+
+    # Subset the adata on the refernce of interest. Optionally, subset additionally on a sample of interest.
+    record_subset = adata[adata.obs[reference_column] == reference].copy()
+    if sample:
+        record_subset = record_subset[record_subset.obs[sample_column] == sample].copy()
+    else:
+        pass
+
+    # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
+    layers = [layer for layer in record_subset.layers if '_binary_' in layer]
+    layer_map, layer_counts = {}, []
+    for i, layer in enumerate(layers):
+        # Gives an integer mapping to access which sequence base the binary layer is encoding
+        layer_map[i] = layer.split('_')[0]
+        # Get the positional counts from all reads for the given base identity.
+        layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
+    # Combine the positional counts array derived from each binary base layer into an ndarray
+    count_array = np.array(layer_counts)
+    # Determine the row index that contains the largest count for each position and store this in an array.
+    nucleotide_indexes = np.argmax(count_array, axis=0)
+    # Map the base sequence derived from the row index array to attain the consensus sequence in a list.
+    consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+
+    if sample:
+        adata.var[f'{reference}_consensus_from_{sample}'] = consensus_sequence_list
+    else:
+        adata.var[f'{reference}_consensus_across_samples'] = consensus_sequence_list
+
+    adata.uns[f'{reference}_consensus_sequence'] = consensus_sequence_list
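The consensus call reduces to a per-position argmax over the summed binary base layers; a toy restatement of that step (standalone NumPy with invented counts, not the packaged function):

# Standalone illustration of the argmax consensus step shown above.
import numpy as np

layer_map = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
# Rows = base-identity layers, columns = positions; values = reads supporting that base.
count_array = np.array([
    [9, 0, 1, 2],   # A counts per position
    [1, 8, 0, 3],   # C
    [0, 1, 9, 0],   # G
    [0, 1, 0, 5],   # T
])
nucleotide_indexes = np.argmax(count_array, axis=0)
consensus = [layer_map[i] for i in nucleotide_indexes]
print(consensus)  # ['A', 'C', 'G', 'T']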