smftools 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +29 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/informatics/__init__.py +16 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/conversion_smf.py +132 -0
- smftools/informatics/direct_smf.py +137 -0
- smftools/informatics/fast5_to_pod5.py +21 -0
- smftools/informatics/helpers/LoadExperimentConfig.py +75 -0
- smftools/informatics/helpers/__init__.py +74 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +59 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +74 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +79 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +55 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/extract_base_identities.py +44 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +50 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +884 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +53 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +36 -0
- smftools/informatics/load_adata.py +182 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/plotting/__init__.py +15 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +205 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/preprocessing/__init__.py +33 -0
- smftools/preprocessing/append_C_context.py +82 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +94 -0
- smftools/preprocessing/calculate_coverage.py +42 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/clean_NaN.py +46 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_converted_reads_on_methylation.py +44 -0
- smftools/preprocessing/filter_reads_on_length.py +51 -0
- smftools/preprocessing/flag_duplicate_reads.py +149 -0
- smftools/preprocessing/invert_adata.py +30 -0
- smftools/preprocessing/load_sample_sheet.py +38 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +198 -0
- smftools/tools/__init__.py +49 -0
- smftools/tools/apply_hmm.py +202 -0
- smftools/tools/apply_hmm_batched.py +241 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_distances.py +18 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/call_hmm_peaks.py +105 -0
- smftools/tools/classifiers.py +787 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/data/__init__.py +2 -0
- smftools/tools/data/anndata_data_module.py +90 -0
- smftools/tools/data/preprocessing.py +6 -0
- smftools/tools/display_hmm.py +18 -0
- smftools/tools/evaluation/__init__.py +0 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/hmm_readwrite.py +16 -0
- smftools/tools/inference/__init__.py +1 -0
- smftools/tools/inference/lightning_inference.py +41 -0
- smftools/tools/models/__init__.py +9 -0
- smftools/tools/models/base.py +14 -0
- smftools/tools/models/cnn.py +34 -0
- smftools/tools/models/lightning_base.py +41 -0
- smftools/tools/models/mlp.py +17 -0
- smftools/tools/models/positional.py +17 -0
- smftools/tools/models/rnn.py +16 -0
- smftools/tools/models/sklearn_models.py +40 -0
- smftools/tools/models/transformer.py +133 -0
- smftools/tools/models/wrappers.py +20 -0
- smftools/tools/nucleosome_hmm_refinement.py +104 -0
- smftools/tools/position_stats.py +239 -0
- smftools/tools/read_stats.py +70 -0
- smftools/tools/subset_adata.py +28 -0
- smftools/tools/train_hmm.py +78 -0
- smftools/tools/training/__init__.py +1 -0
- smftools/tools/training/train_lightning_model.py +47 -0
- smftools/tools/utils/__init__.py +2 -0
- smftools/tools/utils/device.py +10 -0
- smftools/tools/utils/grl.py +14 -0
- {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/METADATA +5 -2
- smftools-0.1.7.dist-info/RECORD +136 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -0,0 +1,79 @@
+def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu'):
+    """
+    Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
+
+    Parameters:
+        base_identities (dict): A dictionary returned by extract_base_identities, keyed by read name. Points to a list of base identities.
+        strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
+        modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
+        bam (str): The BAM file path.
+
+    Returns:
+        dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
+    """
+    import numpy as np
+
+    # If the modification type is 'unconverted', return NaN for all positions
+    if modification_type == "unconverted":
+        #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+        return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
+
+    # Define mappings for binarization based on strand and modification type
+    binarization_maps = {
+        ('top', '5mC'): {'C': 1, 'T': 0},
+        ('top', '6mA'): {'A': 1, 'G': 0},
+        ('bottom', '5mC'): {'G': 1, 'A': 0},
+        ('bottom', '6mA'): {'T': 1, 'C': 0}
+    }
+
+    if (strand, modification_type) not in binarization_maps:
+        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # Fetch the appropriate mapping
+    base_map = binarization_maps[(strand, modification_type)]
+
+    binarized_base_identities = {}
+    for key, bases in base_identities.items():
+        arr = np.array(bases, dtype='<U1')
+        # Apply mapping with fallback to NaN; otypes=[float] pins the output dtype to float
+        # so the NaN fallback cannot fail when the first element maps to an integer.
+        binarized = np.vectorize(lambda x: base_map.get(x, np.nan), otypes=[float])(arr)
+        binarized_base_identities[key] = binarized
+
+    return binarized_base_identities
+
+    # Archived torch implementation of the same mapping:
+    # import torch
+
+    # # If the modification type is 'unconverted', return NaN for all positions
+    # if modification_type == "unconverted":
+    #     print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+    #     return {key: torch.full((len(bases),), float('nan'), device=device) for key, bases in base_identities.items()}
+
+    # # Define mappings for binarization based on strand and modification type
+    # binarization_maps = {
+    #     ('top', '5mC'): {'C': 1, 'T': 0},
+    #     ('top', '6mA'): {'A': 1, 'G': 0},
+    #     ('bottom', '5mC'): {'G': 1, 'A': 0},
+    #     ('bottom', '6mA'): {'T': 1, 'C': 0}
+    # }
+
+    # if (strand, modification_type) not in binarization_maps:
+    #     raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # # Fetch the appropriate mapping
+    # base_map = binarization_maps[(strand, modification_type)]
+
+    # # Convert mapping to tensor
+    # base_keys = list(base_map.keys())
+    # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
+
+    # # Create a lookup table (ASCII-based for fast mapping)
+    # lookup_table = torch.full((256,), float('nan'), dtype=torch.float32, device=device)
+    # for k, v in zip(base_keys, base_values):
+    #     lookup_table[ord(k)] = v
+
+    # # Process reads
+    # binarized_base_identities = {}
+    # for key, bases in base_identities.items():
+    #     bases_tensor = torch.tensor([ord(c) for c in bases], dtype=torch.uint8, device=device)  # Convert chars to ASCII
+    #     binarized = lookup_table[bases_tensor]  # Efficient lookup
+    #     binarized_base_identities[key] = binarized
+
+    # return binarized_base_identities
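For orientation, a minimal usage sketch of the new helper (the read dictionary and BAM path below are hypothetical; assumes the 0.1.7 wheel is installed):

import numpy as np
from smftools.informatics.helpers.binarize_converted_base_identities import binarize_converted_base_identities

# Hypothetical reads from a top-strand 5mC conversion experiment:
# unconverted 'C' -> methylated (1), converted 'T' -> unmethylated (0), other bases -> NaN.
reads = {'read_1': ['C', 'T', 'A', 'C'], 'read_2': ['T', 'T', 'G', 'C']}
binarized = binarize_converted_base_identities(reads, strand='top', modification_type='5mC', bam='sample.bam')
print(binarized['read_1'])  # [ 1.  0. nan  1.]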
smftools/informatics/helpers/canoncall.py
@@ -0,0 +1,34 @@
+## canoncall
+
+# Conversion SMF specific
+def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+    """
+    Wrapper function for dorado canonical base calling.
+
+    Parameters:
+        model_dir (str): A string representing the file path to the dorado basecalling model directory.
+        model (str): A string representing the dorado basecalling model.
+        pod5_dir (str): A string representing the file path to the experiment directory containing the POD5 files.
+        barcode_kit (str): A string representing the barcoding kit used in the experiment.
+        bam (str): File path to the BAM file to output.
+        bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): The device to use. 'auto' is the default, which detects the device to use. Can also specify metal, cpu, or cuda.
+
+    Returns:
+        None
+            Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
+    """
+    import subprocess
+    output = bam + bam_suffix
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
+    command_string = " ".join(command)
+    print(f"Running: {command_string}\nto generate {output}")
+    with open(output, "w") as outfile:
+        subprocess.run(command, stdout=outfile)
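A hypothetical invocation of the wrapper (all paths, the model name, and the kit name are placeholders; dorado must be on the PATH):

from smftools.informatics.helpers.canoncall import canoncall

# Writes canonical basecalls from the POD5 directory to basecalls.bam.
canoncall(
    model_dir='/models',                              # placeholder model directory
    model='dna_r10.4.1_e8.2_400bps_hac@v4.2.0',       # example dorado model name
    pod5_dir='/data/pod5s',                           # placeholder POD5 directory
    barcode_kit='SQK-NBD114-24',                      # example barcoding kit
    bam='basecalls',
    bam_suffix='.bam',
    barcode_both_ends=True,
    trim=False,
    device='auto',
)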
smftools/informatics/helpers/complement_base_list.py
@@ -0,0 +1,21 @@
+# complement_base_list
+
+def complement_base_list(sequence):
+    """
+    Takes a list of DNA base identities and returns their complement.
+
+    Parameters:
+        sequence (list): A list of DNA bases (e.g., ['A', 'C', 'G', 'T']).
+
+    Returns:
+        complement (list): A list of complementary DNA bases.
+    """
+    complement_mapping = {
+        'A': 'T',
+        'T': 'A',
+        'C': 'G',
+        'G': 'C',
+        'N': 'N'  # Handling ambiguous bases like 'N'
+    }
+
+    return [complement_mapping[base] for base in sequence]
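Behavior, for reference:

from smftools.informatics.helpers.complement_base_list import complement_base_list

print(complement_base_list(['A', 'C', 'G', 'T', 'N']))  # ['T', 'G', 'C', 'A', 'N']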
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -0,0 +1,55 @@
+# concatenate_fastqs_to_bam
+
+def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
+    """
+    Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
+
+    Parameters:
+        fastq_files (list): List of paths to demultiplexed FASTQ files.
+        output_bam (str): Path to the output BAM file.
+        barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
+        gzip_suffix (str): Suffix to use for input gzip files (default: '.gz').
+
+    Returns:
+        None
+    """
+    import os
+    import pysam
+    import gzip
+    from Bio import SeqIO
+    from tqdm import tqdm
+
+    n_fastqs = len(fastq_files)
+
+    with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
+        for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
+            # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
+            base_name = os.path.basename(fastq_file)
+            if n_fastqs > 1:
+                if base_name.endswith(f'.fastq{gzip_suffix}'):
+                    barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
+                elif base_name.endswith(f'.fq{gzip_suffix}'):
+                    barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
+                elif base_name.endswith('.fastq'):
+                    barcode = base_name.split('_')[-1].replace('.fastq', '')
+                elif base_name.endswith('.fq'):
+                    barcode = base_name.split('_')[-1].replace('.fq', '')
+                else:
+                    raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
+            else:
+                barcode = 'barcode0'
+
+            # Read the FASTQ file (handle gzipped and non-gzipped files)
+            open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
+            with open_func(fastq_file, 'rt') as fq_in:
+                for record in SeqIO.parse(fq_in, 'fastq'):
+                    # Create an unaligned BAM entry for each FASTQ record
+                    aln = pysam.AlignedSegment()
+                    aln.query_name = record.id
+                    aln.query_sequence = str(record.seq)
+                    aln.flag = 4  # Unmapped
+                    # letter_annotations["phred_quality"] is already a list of integer scores, which pysam accepts directly
+                    aln.query_qualities = record.letter_annotations["phred_quality"]
+                    # Add the barcode to the BC tag
+                    aln.set_tag(barcode_tag, barcode)
+                    # Write to BAM file
+                    bam_out.write(aln)
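A hypothetical call, assuming demultiplexed FASTQs whose barcode is the final underscore-delimited token of the filename (filenames below are placeholders):

from smftools.informatics.helpers.concatenate_fastqs_to_bam import concatenate_fastqs_to_bam

# Barcodes 'barcode01' and 'barcode02' are parsed from these filenames and stored in the BC tag.
fastqs = ['sample_barcode01.fastq.gz', 'sample_barcode02.fastq.gz']
concatenate_fastqs_to_bam(fastqs, 'unaligned_reads.bam', barcode_tag='BC', gzip_suffix='.gz')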
smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -0,0 +1,245 @@
+## converted_BAM_to_adata
+
+def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix):
+    """
+    A wrapper function to take converted aligned_sorted_split BAM files and format the data into an anndata object.
+
+    Parameters:
+        converted_FASTA (str): A string representing the file path to the converted FASTA reference.
+        split_dir (str): A string representing the file path to the directory containing the converted aligned_sorted_split BAM files.
+        mapping_threshold (float): A value between 0 and 1 thresholding the minimal fraction of aligned reads that map to a reference region. References with values above the threshold are included in the output adata.
+        experiment_name (str): A string to provide an experiment name to the output adata file.
+        conversion_types (list): A list of strings of the conversion types to use in the analysis.
+        bam_suffix (str): The suffix to use for the BAM file.
+
+    Returns:
+        final_adata_path (str): File path to the final adata object.
+            Outputs a single gzipped adata object for the experiment.
+    """
+    from .. import readwrite
+    from .binarize_converted_base_identities import binarize_converted_base_identities
+    from .find_conversion_sites import find_conversion_sites
+    from .count_aligned_reads import count_aligned_reads
+    from .extract_base_identities import extract_base_identities
+    from .make_dirs import make_dirs
+    from .ohe_batching import ohe_batching
+    import pandas as pd
+    import numpy as np
+    import anndata as ad
+    import os
+    from tqdm import tqdm
+    import gc
+
+    ##########################################################################################
+    ## Get file paths and make necessary directories. ##
+    # Get all of the input BAM files
+    files = os.listdir(split_dir)
+    # Make output dir
+    parent_dir = os.path.dirname(split_dir)
+    split_dir_base = os.path.basename(split_dir)
+    h5_dir = os.path.join(parent_dir, 'h5ads')
+    final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{split_dir_base}.h5ad')
+
+    if os.path.exists(f"{final_adata_path}.gz"):
+        print(f'{final_adata_path}.gz already exists, using existing adata object')  # Stops here if the final_adata file already exists
+        return final_adata_path
+
+    tmp_dir = os.path.join(parent_dir, 'tmp')
+    make_dirs([h5_dir, tmp_dir])
+    # Keep file names that contain the BAM suffix and are not index files
+    bams = [bam for bam in files if bam_suffix in bam and '.bai' not in bam]
+    # Sort the file list by name and print the list of file names
+    bams.sort()
+    bam_path_list = [os.path.join(split_dir, bam) for bam in bams]
+    print(f'Found the following BAMs: {bams}')
+    final_adata = None
+    ##########################################################################################
+
+    ##########################################################################################
+    ## need to fix this section
+    # Make a dictionary, keyed by modification type, that points to another dictionary of unconverted_record_ids. This points to a list of: 1) record length, 2) top strand conversion coordinates, 3) bottom strand conversion coordinates, 4) unconverted sequence string, 5) unconverted complement sequence
+    modification_dict = {}
+    # Init a dict keyed by FASTA record that points to the sequence string of the unconverted record
+    record_FASTA_dict = {}
+    # While populating the dictionary, also extract the longest sequence record in the input references
+    max_reference_length = 0
+    conversions = conversion_types[1:]
+    for conversion_type in conversions:
+        # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coordinate list, 4) unconverted sequence string, 5) unconverted complement sequence
+        modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
+        # Get the max reference length
+        for record in modification_dict[conversion_type].keys():
+            if modification_dict[conversion_type][record][0] > max_reference_length:
+                max_reference_length = modification_dict[conversion_type][record][0]
+
+            mod_type, strand = record.split('_')[-2:]
+
+            chromosome = record.split('_{0}_{1}'.format(mod_type, strand))[0]
+            unconverted_chromosome_name = f'{chromosome}_{conversion_types[0]}_top'
+            current_reference_length = modification_dict[mod_type][unconverted_chromosome_name][0]
+            delta_max_length = max_reference_length - current_reference_length
+            sequence = modification_dict[mod_type][unconverted_chromosome_name][3] + 'N'*delta_max_length
+            complement = modification_dict[mod_type][unconverted_chromosome_name][4] + 'N'*delta_max_length
+            record_FASTA_dict[record] = [sequence, complement, chromosome, unconverted_chromosome_name, current_reference_length, delta_max_length, conversion_type, strand]
+    ##########################################################################################
+
+    ##########################################################################################
+    bam_alignment_stats_dict = {}
+    records_to_analyze = []
+    for bam_index, bam in enumerate(bam_path_list):
+        bam_alignment_stats_dict[bam_index] = {}
+        # Look at aligned read proportions in the bam
+        aligned_reads_count, unaligned_reads_count, record_counts = count_aligned_reads(bam)
+        percent_aligned = aligned_reads_count*100 / (aligned_reads_count + unaligned_reads_count)
+        print(f'{percent_aligned} percent of total reads in {bams[bam_index]} aligned successfully')
+        bam_alignment_stats_dict[bam_index]['Total'] = (aligned_reads_count, percent_aligned)
+        # Iterate over converted reference strands and decide which to use in the analysis based on the mapping_threshold
+        for record in record_counts:
+            print(f'{record_counts[record][0]} reads mapped to reference record {record}. This is {record_counts[record][1]*100} percent of all mapped reads in the sample.')
+            if record_counts[record][1] >= mapping_threshold:
+                records_to_analyze.append(record)
+                bam_alignment_stats_dict[bam_index][record] = (record_counts[record][0], record_counts[record][1]*100)
+    records_to_analyze = set(records_to_analyze)
+    ##########################################################################################
+
+    ##########################################################################################
+    # One hot encode read sequences and write them out into the tmp_dir as h5ad files.
+    # Save the file paths in the bam_record_ohe_files dict.
+    bam_record_ohe_files = {}
+
+    # Iterate over split bams
+    for bam_index, bam in enumerate(bam_path_list):
+        # Iterate over references to process
+        for record in records_to_analyze:
+            unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+            sample = bams[bam_index].split(sep=bam_suffix)[0]
+            chromosome = record_FASTA_dict[unconverted_record_name][2]
+            current_reference_length = record_FASTA_dict[unconverted_record_name][4]
+            mod_type = record_FASTA_dict[unconverted_record_name][6]
+            strand = record_FASTA_dict[unconverted_record_name][7]
+
+            # Extract the base identities of reads aligned to the record
+            fwd_base_identities, rev_base_identities = extract_base_identities(bam, record, range(current_reference_length), max_reference_length)
+
+            # Binarize the dictionary of positional identities
+            print('Binarizing base identities')
+            fwd_binarized_base_identities = binarize_converted_base_identities(fwd_base_identities, strand, mod_type, bam)
+            rev_binarized_base_identities = binarize_converted_base_identities(rev_base_identities, strand, mod_type, bam)
+            merged_binarized_base_identities = {**fwd_binarized_base_identities, **rev_binarized_base_identities}
+            # Convert the base identity dictionary to a dataframe
+            binarized_base_identities_df = pd.DataFrame.from_dict(merged_binarized_base_identities, orient='index')
+            sorted_index = sorted(binarized_base_identities_df.index)
+            binarized_base_identities_df = binarized_base_identities_df.reindex(sorted_index)
+
+            # Load an anndata object with the sample data
+            X = binarized_base_identities_df.values
+            adata = ad.AnnData(X, dtype=X.dtype)
+            if adata.shape[0] > 0:
+                adata.obs_names = binarized_base_identities_df.index.astype(str)
+                adata.var_names = binarized_base_identities_df.columns.astype(str)
+                adata.obs['Sample'] = [sample] * len(adata)
+                adata.obs['Reference'] = [chromosome] * len(adata)
+                adata.obs['Strand'] = [strand] * len(adata)
+                adata.obs['Dataset'] = [mod_type] * len(adata)
+                adata.obs['Reference_dataset_strand'] = [f'{chromosome}_{mod_type}_{strand}'] * len(adata)
+                adata.obs['Reference_strand'] = [f'{record}'] * len(adata)
+
+                read_mapping_direction = []
+                for read_id in adata.obs_names:
+                    if read_id in fwd_base_identities.keys():
+                        read_mapping_direction.append('fwd')
+                    elif read_id in rev_base_identities.keys():
+                        read_mapping_direction.append('rev')
+                    else:
+                        read_mapping_direction.append('unk')
+
+                adata.obs['Read_mapping_direction'] = read_mapping_direction
+
+                # One hot encode the sequence string of the reads
+                fwd_ohe_files = ohe_batching(fwd_base_identities, tmp_dir, record, f"{bam_index}_fwd", batch_size=100000)
+                rev_ohe_files = ohe_batching(rev_base_identities, tmp_dir, record, f"{bam_index}_rev", batch_size=100000)
+                bam_record_ohe_files[f'{bam_index}_{record}'] = fwd_ohe_files + rev_ohe_files
+                del fwd_base_identities, rev_base_identities
+
+                one_hot_reads = {}
+                n_rows_OHE = 5
+                for ohe_file in tqdm(bam_record_ohe_files[f'{bam_index}_{record}'], desc="Reading in OHE reads"):
+                    tmp_ohe_dict = ad.read_h5ad(ohe_file).uns
+                    one_hot_reads.update(tmp_ohe_dict)
+                    del tmp_ohe_dict
+
+                read_names = list(one_hot_reads.keys())
+
+                sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
+                df_A = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_C = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_G = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_T = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_N = np.zeros((len(sorted_index), sequence_length), dtype=int)
+
+                # Process one-hot data into dictionaries
+                dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
+                for read_name, one_hot_array in one_hot_reads.items():
+                    one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
+                    dict_A[read_name] = one_hot_array[0, :]
+                    dict_C[read_name] = one_hot_array[1, :]
+                    dict_G[read_name] = one_hot_array[2, :]
+                    dict_T[read_name] = one_hot_array[3, :]
+                    dict_N[read_name] = one_hot_array[4, :]
+
+                del one_hot_reads
+                gc.collect()
+
+                # Fill the arrays
+                for j, read_name in tqdm(enumerate(sorted_index), desc='Loading arrays of OHE reads', total=len(sorted_index)):
+                    df_A[j, :] = dict_A[read_name]
+                    df_C[j, :] = dict_C[read_name]
+                    df_G[j, :] = dict_G[read_name]
+                    df_T[j, :] = dict_T[read_name]
+                    df_N[j, :] = dict_N[read_name]
+
+                del dict_A, dict_C, dict_G, dict_T, dict_N
+                gc.collect()
+
+                # Store the results in AnnData layers
+                ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
+                for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
+                    adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j]
+                    ohe_df_map[j] = None  # Release the reference for memory usage purposes
+
+                if final_adata is not None:
+                    if adata.shape[0] > 0:
+                        final_adata = ad.concat([final_adata, adata], join='outer', index_unique=None)
+                    else:
+                        print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+                else:
+                    if adata.shape[0] > 0:
+                        final_adata = adata
+                    else:
+                        print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+
+            else:
+                print(f"{sample} did not have any mapped reads on {record}, omitting from final adata")
+
+    # Set obs columns to type 'category'
+    for col in final_adata.obs.columns:
+        final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+    for record in records_to_analyze:
+        unconverted_record_name = "_".join(record.split('_')[:-2]) + '_unconverted_top'
+        sequence = record_FASTA_dict[unconverted_record_name][0]
+        complement = record_FASTA_dict[unconverted_record_name][1]
+        chromosome = record_FASTA_dict[unconverted_record_name][2]
+        final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
+        final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
+        final_adata.uns[f'{chromosome}_FASTA_sequence'] = sequence
+
+    ######################################################################################################
+
+    ######################################################################################################
+    ## Export the final adata object
+    print('Saving initial draft of final adata')
+    final_adata.write_h5ad(final_adata_path)
+    return final_adata_path
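Finally, a sketch of the end-to-end call (all paths and names are hypothetical; the first entry of conversion_types is treated as the unconverted control). The function returns the path to the written .h5ad:

from smftools.informatics.helpers.converted_BAM_to_adata import converted_BAM_to_adata

# Hypothetical inputs: a converted reference FASTA, a directory of aligned_sorted_split BAMs,
# and a 5% minimum mapping fraction per reference record.
adata_path = converted_BAM_to_adata(
    converted_FASTA='reference_converted.fasta',
    split_dir='/experiment/aligned_sorted_split',
    mapping_threshold=0.05,
    experiment_name='demo_experiment',
    conversion_types=['unconverted', '5mC'],
    bam_suffix='.bam',
)
print(adata_path)  # /experiment/h5ads/demo_experiment_aligned_sorted_split.h5ad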