smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +5 -1
- smftools/_version.py +1 -1
- smftools/informatics/__init__.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/conversion_smf.py +63 -10
- smftools/informatics/direct_smf.py +66 -18
- smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
- smftools/informatics/helpers/__init__.py +16 -2
- smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
- smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
- smftools/informatics/helpers/canoncall.py +12 -3
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
- smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/extract_base_identities.py +33 -46
- smftools/informatics/helpers/extract_mods.py +55 -23
- smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/find_conversion_sites.py +33 -44
- smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
- smftools/informatics/helpers/modcall.py +13 -5
- smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
- smftools/informatics/helpers/ohe_batching.py +65 -41
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +45 -9
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/split_and_index_BAM.py +3 -8
- smftools/informatics/load_adata.py +58 -3
- smftools/plotting/__init__.py +15 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +205 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/preprocessing/__init__.py +6 -7
- smftools/preprocessing/append_C_context.py +22 -9
- smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
- smftools/preprocessing/binarize_on_Youden.py +35 -32
- smftools/preprocessing/binary_layers_to_ohe.py +13 -3
- smftools/preprocessing/calculate_complexity.py +3 -2
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
- smftools/preprocessing/calculate_coverage.py +26 -25
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_position_Youden.py +18 -7
- smftools/preprocessing/calculate_read_length_stats.py +39 -46
- smftools/preprocessing/clean_NaN.py +33 -25
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
- smftools/preprocessing/filter_reads_on_length.py +14 -4
- smftools/preprocessing/flag_duplicate_reads.py +149 -0
- smftools/preprocessing/invert_adata.py +18 -11
- smftools/preprocessing/load_sample_sheet.py +30 -16
- smftools/preprocessing/recipes.py +22 -20
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +105 -13
- smftools/tools/__init__.py +49 -0
- smftools/tools/apply_hmm.py +202 -0
- smftools/tools/apply_hmm_batched.py +241 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_distances.py +18 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/call_hmm_peaks.py +105 -0
- smftools/tools/classifiers.py +787 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/data/__init__.py +2 -0
- smftools/tools/data/anndata_data_module.py +90 -0
- smftools/tools/data/preprocessing.py +6 -0
- smftools/tools/display_hmm.py +18 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/hmm_readwrite.py +16 -0
- smftools/tools/inference/__init__.py +1 -0
- smftools/tools/inference/lightning_inference.py +41 -0
- smftools/tools/models/__init__.py +9 -0
- smftools/tools/models/base.py +14 -0
- smftools/tools/models/cnn.py +34 -0
- smftools/tools/models/lightning_base.py +41 -0
- smftools/tools/models/mlp.py +17 -0
- smftools/tools/models/positional.py +17 -0
- smftools/tools/models/rnn.py +16 -0
- smftools/tools/models/sklearn_models.py +40 -0
- smftools/tools/models/transformer.py +133 -0
- smftools/tools/models/wrappers.py +20 -0
- smftools/tools/nucleosome_hmm_refinement.py +104 -0
- smftools/tools/position_stats.py +239 -0
- smftools/tools/read_stats.py +70 -0
- smftools/tools/subset_adata.py +19 -23
- smftools/tools/train_hmm.py +78 -0
- smftools/tools/training/__init__.py +1 -0
- smftools/tools/training/train_lightning_model.py +47 -0
- smftools/tools/utils/__init__.py +2 -0
- smftools/tools/utils/device.py +10 -0
- smftools/tools/utils/grl.py +14 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
- smftools-0.1.7.dist-info/RECORD +136 -0
- smftools/tools/apply_HMM.py +0 -1
- smftools/tools/read_HMM.py +0 -1
- smftools/tools/train_HMM.py +0 -43
- smftools-0.1.3.dist-info/RECORD +0 -84
- /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
- /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
- {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/aligned_BAM_to_bed.py
@@ -1,73 +1,74 @@
-
-
-def aligned_BAM_to_bed(aligned_BAM, plotting_dir, bed_dir, fasta):
+def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
-    Takes an aligned BAM as input and writes a
-    Bed columns are: Record name, start position, end position, read length, read name
+    Takes an aligned BAM as input and writes a BED file of reads as output.
+    Bed columns are: Record name, start position, end position, read length, read name.
 
     Parameters:
         aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
-
-
-
+        out_dir (str): Directory to output files.
+        fasta (str): File path to the reference genome.
+        make_bigwigs (bool): Whether to generate bigwig files.
+        threads (int): Number of threads to use.
 
     Returns:
         None
-
     """
    import subprocess
    import os
+    import concurrent.futures
+    from concurrent.futures import ProcessPoolExecutor
     from .bed_to_bigwig import bed_to_bigwig
+    from . import make_dirs
     from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
 
-
-
+    threads = threads or os.cpu_count() # Use max available cores if not specified
+
+    # Create necessary directories
+    plotting_dir = os.path.join(out_dir, "bed_cov_histograms")
+    bed_dir = os.path.join(out_dir, "beds")
+    make_dirs([plotting_dir, bed_dir])
 
-
+    bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))
+
+    print(f"Creating BED from BAM: {aligned_BAM} using {threads} threads...")
+
+    # Convert BAM to BED format
     with open(bed_output, "w") as output_file:
-
+        samtools_view = subprocess.Popen(["samtools", "view", "-@", str(threads), aligned_BAM], stdout=subprocess.PIPE)
+        awk_process = subprocess.Popen(
+            ["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'],
+            stdin=samtools_view.stdout,
+            stdout=output_file
+        )
+
     samtools_view.stdout.close()
     awk_process.wait()
     samtools_view.wait()
 
-
-
-
-
-
-
-
-
-    Returns:
-        aligned (str): Path to the aligned bed file
-    """
-    unaligned = bed.split('.bed')[0] + '_unaligned.bed'
-    aligned = bed.split('.bed')[0] + '_aligned.bed'
-
-    with open(bed, 'r') as infile, \
-        open(unaligned, 'w') as unaligned_outfile, \
-        open(aligned, 'w') as aligned_outfile:
-
+    print(f"BED file created: {bed_output}")
+
+    def split_bed(bed):
+        """Splits BED into aligned and unaligned reads."""
+        aligned = bed.replace(".bed", "_aligned.bed")
+        unaligned = bed.replace(".bed", "_unaligned.bed")
+
+        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
             for line in infile:
-
-
-                if fields[0] == '*':
-                    unaligned_outfile.write(line)
-                else:
-                    aligned_outfile.write(line)
-
-    if delete_input:
-        os.remove(bed)
-
-    return aligned
-
-    aligned_bed = split_bed(bed_output)
+                (unaligned_out if line.startswith("*") else aligned_out).write(line)
 
-
-
+        os.remove(bed)
+        return aligned
 
-
-
+    print(f"Splitting BED: {bed_output}")
+    aligned_bed = split_bed(bed_output)
 
+    with ProcessPoolExecutor() as executor: # Use processes instead of threads
+        futures = []
+        futures.append(executor.submit(plot_read_length_and_coverage_histograms, aligned_bed, plotting_dir))
+        if make_bigwigs:
+            futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
 
+        # Wait for all tasks to complete
+        concurrent.futures.wait(futures)
 
+    print("Processing completed successfully.")
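
The rewrite above replaces the old plotting_dir/bed_dir arguments with a single out_dir and parallelizes the downstream plotting and bigwig steps. A minimal usage sketch (hypothetical paths; the import path is assumed from the file listing above):

    from smftools.informatics.helpers.aligned_BAM_to_bed import aligned_BAM_to_bed

    aligned_BAM_to_bed(
        aligned_BAM="sample_aligned.bam",  # hypothetical input BAM
        out_dir="results",                 # beds/ and bed_cov_histograms/ are created inside
        fasta="reference.fa",              # hypothetical reference
        make_bigwigs=False,                # set True to also run bed_to_bigwig
        threads=4,                         # defaults to os.cpu_count() when None
    )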
smftools/informatics/helpers/bam_qc.py (new file)
@@ -0,0 +1,66 @@
+## bam_qc
+
+def bam_qc(bam_files, bam_qc_dir, threads, modality, stats=True, flagstats=True, idxstats=True):
+    """
+    Performs QC on BAM files by running samtools stats, flagstat, and idxstats.
+
+    Parameters:
+        - bam_files: List of BAM file paths.
+        - bam_qc_dir: Directory to save QC reports.
+        - threads: Number threads to use.
+        - modality: 'conversion' or 'direct' (affects processing mode).
+        - stats: Run `samtools stats` if True.
+        - flagstats: Run `samtools flagstat` if True.
+        - idxstats: Run `samtools idxstats` if True.
+    """
+    import os
+    import subprocess
+
+    # Ensure the QC output directory exists
+    os.makedirs(bam_qc_dir, exist_ok=True)
+
+    if threads:
+        threads = str(threads)
+    else:
+        pass
+
+    for bam in bam_files:
+        bam_name = os.path.basename(bam).replace(".bam", "") # Extract filename without extension
+
+        # Run samtools QC commands based on selected options
+        if stats:
+            stats_out = os.path.join(bam_qc_dir, f"{bam_name}_stats.txt")
+            if threads:
+                command = ["samtools", "stats", "-@", threads, bam]
+            else:
+                command = ["samtools", "stats", bam]
+            print(f"Running: {' '.join(command)} > {stats_out}")
+            with open(stats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+        if flagstats:
+            flagstats_out = os.path.join(bam_qc_dir, f"{bam_name}_flagstat.txt")
+            if threads:
+                command = ["samtools", "flagstat", "-@", threads, bam]
+            else:
+                command = ["samtools", "flagstat", bam]
+            print(f"Running: {' '.join(command)} > {flagstats_out}")
+            with open(flagstats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+        if idxstats:
+            idxstats_out = os.path.join(bam_qc_dir, f"{bam_name}_idxstats.txt")
+            if threads:
+                command = ["samtools", "idxstats", "-@", threads, bam]
+            else:
+                command = ["samtools", "idxstats", bam]
+            print(f"Running: {' '.join(command)} > {idxstats_out}")
+            with open(idxstats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+    if modality == 'conversion':
+        pass
+    elif modality == 'direct':
+        pass
+
+    print("QC processing completed.")
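
bam_qc is a thin wrapper that shells out to samtools once per selected report. A usage sketch under the same import-path assumption (hypothetical file names; BAMs should already be indexed for idxstats to report mapped counts):

    from smftools.informatics.helpers.bam_qc import bam_qc

    # Writes sample1_stats.txt, sample1_flagstat.txt, and sample1_idxstats.txt into qc/
    bam_qc(["sample1.bam"], "qc", threads=4, modality="conversion")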
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -1,31 +1,79 @@
-
-# Conversion SMF specific
-def binarize_converted_base_identities(base_identities, strand, modification_type):
+def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu'):
     """
-
+    Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
 
     Parameters:
         base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
-
+        bam (str): The bam file path
+
     Returns:
-
+        dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
     """
     import numpy as np
+
+    # If the modification type is 'unconverted', return NaN for all positions
+    if modification_type == "unconverted":
+        #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+        return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
+
+    # Define mappings for binarization based on strand and modification type
+    binarization_maps = {
+        ('top', '5mC'): {'C': 1, 'T': 0},
+        ('top', '6mA'): {'A': 1, 'G': 0},
+        ('bottom', '5mC'): {'G': 1, 'A': 0},
+        ('bottom', '6mA'): {'T': 1, 'C': 0}
+    }
+
+    if (strand, modification_type) not in binarization_maps:
+        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # Fetch the appropriate mapping
+    base_map = binarization_maps[(strand, modification_type)]
+
     binarized_base_identities = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for key, bases in base_identities.items():
+        arr = np.array(bases, dtype='<U1')
+        binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr) # Apply mapping with fallback to NaN
+        binarized_base_identities[key] = binarized
+
+    return binarized_base_identities
+    # import torch
+
+    # # If the modification type is 'unconverted', return NaN for all positions
+    # if modification_type == "unconverted":
+    #     print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+    #     return {key: torch.full((len(bases),), float('nan'), device=device) for key, bases in base_identities.items()}
+
+    # # Define mappings for binarization based on strand and modification type
+    # binarization_maps = {
+    #     ('top', '5mC'): {'C': 1, 'T': 0},
+    #     ('top', '6mA'): {'A': 1, 'G': 0},
+    #     ('bottom', '5mC'): {'G': 1, 'A': 0},
+    #     ('bottom', '6mA'): {'T': 1, 'C': 0}
+    # }
+
+    # if (strand, modification_type) not in binarization_maps:
+    #     raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # # Fetch the appropriate mapping
+    # base_map = binarization_maps[(strand, modification_type)]
+
+    # # Convert mapping to tensor
+    # base_keys = list(base_map.keys())
+    # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
+
+    # # Create a lookup dictionary (ASCII-based for fast mapping)
+    # lookup_table = torch.full((256,), float('nan'), dtype=torch.float32, device=device)
+    # for k, v in zip(base_keys, base_values):
+    #     lookup_table[ord(k)] = v
+
+    # # Process reads
+    # binarized_base_identities = {}
+    # for key, bases in base_identities.items():
+    #     bases_tensor = torch.tensor([ord(c) for c in bases], dtype=torch.uint8, device=device) # Convert chars to ASCII
+    #     binarized = lookup_table[bases_tensor] # Efficient lookup
+    #     binarized_base_identities[key] = binarized
+
+    # return binarized_base_identities
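
The mapping logic can be checked in isolation. A self-contained sketch of the ('top', '5mC') case, where a retained C reads as methylated (1), a converted C (sequenced as T) reads as unmethylated (0), and every other base carries no information (NaN):

    import numpy as np

    base_map = {'C': 1, 'T': 0}  # the ('top', '5mC') entry from binarization_maps
    read = np.array(list("ACTCG"), dtype='<U1')
    binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(read)
    print(binarized)  # [nan  1.  0.  1. nan]

One caveat worth noting: np.vectorize infers its output dtype from the first element, so a read whose first base maps to 0 or 1 appears to produce an integer output array, and the NaN fallback for a later base would then fail to cast. The sketch starts with an uninformative base, so the result is a float array.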
smftools/informatics/helpers/canoncall.py
@@ -1,16 +1,20 @@
 ## canoncall
 
 # Conversion SMF specific
-def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
+def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
     """
     Wrapper function for dorado canonical base calling.
 
     Parameters:
-
+        model_dir (str): a string representing the file path to the dorado basecalling model directory.
+        model (str): a string representing the the dorado basecalling model.
         pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
         barcode_kit (str): A string reppresenting the barcoding kit used in the experiment.
         bam (str): File path to the BAM file to output.
         bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.
 
     Returns:
         None
@@ -18,7 +22,12 @@ def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
     """
     import subprocess
     output = bam + bam_suffix
-    command = ["dorado", "basecaller",
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
     command_string = " ".join(command)
     print(f"Running {command_string}\n to generate {output}")
     with open(output, "w") as outfile:
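
Taken together, the two canoncall hunks assemble one dorado invocation with demultiplexing options baked in. A sketch with hypothetical arguments (model name, paths, and kit are placeholders):

    from smftools.informatics.helpers.canoncall import canoncall

    canoncall(
        model_dir="/models",       # hypothetical dorado models directory
        model="hac@v4.3.0",        # hypothetical basecalling model
        pod5_dir="pod5s/",
        barcode_kit="SQK-NBD114-24",
        bam="basecalls",
        bam_suffix=".bam",
    )
    # With the defaults (barcode_both_ends=True, trim=False) this builds:
    #   dorado basecaller --models-directory /models --kit-name SQK-NBD114-24 \
    #     --device auto --batchsize 0 --barcode-both-ends --no-trim hac@v4.3.0 pod5s/
    # and redirects stdout to basecalls.bam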
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -36,7 +36,9 @@ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_su
                 barcode = base_name.split('_')[-1].replace('.fq', '')
             else:
                 raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
-
+        else:
+            barcode = 'barcode0'
+
         # Read the FASTQ file (handle gzipped and non-gzipped files)
         open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
         with open_func(fastq_file, 'rt') as fq_in:
@@ -47,8 +49,7 @@ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_su
             aln.query_sequence = str(record.seq)
             aln.flag = 4 # Unmapped
             aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
-
-
-            aln.set_tag(barcode_tag, barcode)
+            # Add the barcode to the BC tag
+            aln.set_tag(barcode_tag, barcode)
             # Write to BAM file
             bam_out.write(aln)
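
The condition enclosing the new else branch falls outside these hunks, but the barcode derivation itself is plain filename parsing, with 'barcode0' now used as the fallback instead of leaving barcode unset. A sketch with a hypothetical filename:

    base_name = "sample1_barcode03.fq"  # hypothetical FASTQ basename
    barcode = base_name.split('_')[-1].replace('.fq', '')
    print(barcode)  # barcode03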
smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -13,7 +13,7 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         bam_suffix (str): The suffix to use for the BAM file.
 
     Returns:
-
+        final_adata_path (str): File path to the final adata object
         Outputs a single gzipped adata object for the experiment.
     """
     from .. import readwrite
@@ -36,7 +36,14 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     files = os.listdir(split_dir)
     # Make output dir
     parent_dir = os.path.dirname(split_dir)
+    split_dir_base = os.path.basename(split_dir)
     h5_dir = os.path.join(parent_dir, 'h5ads')
+    final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{split_dir_base}.h5ad')
+
+    if os.path.exists(f"{final_adata_path}.gz"):
+        print(f'{final_adata_path}.gz already exists, using existing adata object') # Stops here if the final_adata file already exists
+        return final_adata_path
+
     tmp_dir = os.path.join(parent_dir, 'tmp')
     make_dirs([h5_dir, tmp_dir])
     # Filter file names that contain the search string in their filename and keep them in a list
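
The early-exit check makes reruns cheap: the output path is deterministic, so an existing gzipped AnnData short-circuits the whole conversion. A sketch of the path construction with hypothetical inputs:

    import os

    split_dir = "/exp/split_BAMs"  # hypothetical
    experiment_name = "exp01"      # hypothetical
    parent_dir = os.path.dirname(split_dir)
    final_adata_path = os.path.join(parent_dir, "h5ads", f"{experiment_name}_{os.path.basename(split_dir)}.h5ad")
    print(final_adata_path)  # /exp/h5ads/exp01_split_BAMs.h5ad; skipped if the .gz already exists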
@@ -57,7 +64,8 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     record_FASTA_dict = {}
     # While populating the dictionary, also extract the longest sequence record in the input references
     max_reference_length = 0
-
+    conversions = conversion_types[1:]
+    for conversion_type in conversions:
         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string unconverted , 5) Complement sequence unconverted
         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
         # Get the max reference length
@@ -132,10 +140,11 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
                 adata.obs_names = binarized_base_identities_df.index.astype(str)
                 adata.var_names = binarized_base_identities_df.columns.astype(str)
                 adata.obs['Sample'] = [sample] * len(adata)
+                adata.obs['Reference'] = [chromosome] * len(adata)
                 adata.obs['Strand'] = [strand] * len(adata)
                 adata.obs['Dataset'] = [mod_type] * len(adata)
-                adata.obs['
-                adata.obs['
+                adata.obs['Reference_dataset_strand'] = [f'{chromosome}_{mod_type}_{strand}'] * len(adata)
+                adata.obs['Reference_strand'] = [f'{record}'] * len(adata)
 
                 read_mapping_direction = []
                 for read_id in adata.obs_names:
@@ -162,15 +171,16 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
                 del tmp_ohe_dict
 
                 read_names = list(one_hot_reads.keys())
-                dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
 
                 sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
-                df_A =
-                df_C =
-                df_G =
-                df_T =
-                df_N =
+                df_A = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_C = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_G = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_T = np.zeros((len(sorted_index), sequence_length), dtype=int)
+                df_N = np.zeros((len(sorted_index), sequence_length), dtype=int)
 
+                # Process one-hot data into dictionaries
+                dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
                 for read_name, one_hot_array in one_hot_reads.items():
                     one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
                     dict_A[read_name] = one_hot_array[0, :]
@@ -182,21 +192,22 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
                 del one_hot_reads
                 gc.collect()
 
-
-
-
-
-
-
+                # Fill the arrays
+                for j, read_name in tqdm(enumerate(sorted_index), desc='Loading arrays of OHE reads', total=len(sorted_index)):
+                    df_A[j, :] = dict_A[read_name]
+                    df_C[j, :] = dict_C[read_name]
+                    df_G[j, :] = dict_G[read_name]
+                    df_T[j, :] = dict_T[read_name]
+                    df_N[j, :] = dict_N[read_name]
 
                 del dict_A, dict_C, dict_G, dict_T, dict_N
                 gc.collect()
 
+                # Store the results in AnnData layers
                 ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
-
                 for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
-                    adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j]
-                    ohe_df_map[j] = None
+                    adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j]
+                    ohe_df_map[j] = None # Reassign pointer for memory usage purposes
 
                 if final_adata:
                     if adata.shape[0] > 0:
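
The filling loop assumes each entry of one_hot_reads flattens from an (n_rows_OHE, sequence_length) matrix with one row per base; given the five layers written above, n_rows_OHE is evidently 5 (A, C, G, T, N). A small sketch of that round trip:

    import numpy as np

    ohe = np.eye(5, dtype=int)      # one-hot encoding of the read "ACGTN": rows A, C, G, T, N
    flat = ohe.flatten()            # reads are stored flattened in one_hot_reads
    restored = flat.reshape(5, -1)  # the reshape(n_rows_OHE, -1) step from the diff
    assert (restored == ohe).all()
    # restored[0] fills df_A[j, :] for read j and lands in adata.layers['A_binary_encoding']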
@@ -223,11 +234,12 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         chromosome = record_FASTA_dict[unconverted_record_name][2]
         final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
         final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
-        final_adata.uns[f'{
+        final_adata.uns[f'{chromosome}_FASTA_sequence'] = sequence
 
    ######################################################################################################
 
    ######################################################################################################
    ## Export the final adata object
-
-    final_adata.write_h5ad(
+    print('Saving initial draft of final adata')
+    final_adata.write_h5ad(final_adata_path)
+    return final_adata_path