smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py
RENAMED
@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         bam_files (list): List of split BAM file path strings
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import make_dirs
     import os
     import subprocess
     import glob
-    from .make_dirs import make_dirs
 
-    input_bam = aligned_sorted_BAM
+    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         command += ["-t", str(threads)]
     else:
         pass
-    command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
-    command.append(input_bam)
+    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
+    command.append(str(input_bam))
     command_string = ' '.join(command)
     print(f"Running: {command_string}")
     subprocess.run(command)
 
-
-
-
-
-    bam_files.sort()
+    bam_files = sorted(
+        p for p in split_dir.glob(f"*{bam_suffix}")
+        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
+    )
 
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
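The rewritten discovery step above swaps an in-place `list.sort()` for a single `sorted()` call over a `pathlib` glob that also skips index files and unclassified demux output. A minimal standalone sketch of that pattern (the directory name and helper function are illustrative, not from the package):

```python
from pathlib import Path

def find_split_bams(split_dir: Path, bam_suffix: str = ".bam") -> list[Path]:
    # Glob for BAMs; the suffix check drops .bai indexes, the name check drops unclassified reads
    return sorted(
        p for p in split_dir.glob(f"*{bam_suffix}")
        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
    )

bams = find_split_bams(Path("demux_output"))  # hypothetical output directory
```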
smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py
RENAMED
@@ -1,4 +1,4 @@
-def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
+def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
     """
     Efficiently extracts base identities from mapped reads with reference coordinates.
 
@@ -7,6 +7,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
         chromosome (str): Name of the reference chromosome.
         positions (list): Positions to extract (0-based).
         max_reference_length (int): Maximum reference length for padding.
+        sequence (str): The sequence of the record fasta
 
     Returns:
         dict: Base identities from forward mapped reads.
@@ -16,16 +17,19 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
     import numpy as np
     from collections import defaultdict
     import time
+    from collections import defaultdict, Counter
 
     timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
 
     positions = set(positions)
     fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
     rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+    mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
     #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
+        ref_seq = sequence.upper()
         for read in bam.fetch(chromosome):
             if not read.is_mapped:
                 continue  # Skip unmapped reads
@@ -39,6 +43,28 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
 
             for read_position, reference_position in aligned_pairs:
                 if reference_position in positions:
-
+                    read_base = query_sequence[read_position]
+                    ref_base = ref_seq[reference_position]
 
-
+                    base_dict[read_name][reference_position] = read_base
+
+                    # Track mismatches (excluding Ns)
+                    if read_base != ref_base and read_base != 'N' and ref_base != 'N':
+                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+
+    # Determine C→T vs G→A dominance per read
+    mismatch_trend_per_read = {}
+    for read_name, ref_dict in mismatch_counts_per_read.items():
+        c_to_t = ref_dict.get("C", {}).get("T", 0)
+        g_to_a = ref_dict.get("G", {}).get("A", 0)
+
+        if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
+            mismatch_trend_per_read[read_name] = "equal"
+        elif c_to_t > g_to_a:
+            mismatch_trend_per_read[read_name] = "C->T"
+        elif g_to_a > c_to_t:
+            mismatch_trend_per_read[read_name] = "G->A"
+        else:
+            mismatch_trend_per_read[read_name] = "none"
+
+    return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
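The new mismatch-trend logic classifies each read by whether C→T or G→A mismatches dominate, which is what lets downstream code infer the deaminated strand. A small sketch of the same decision rule on toy counts (the nested-`Counter` shape mirrors `mismatch_counts_per_read`; the values are invented):

```python
from collections import Counter

def classify_mismatch_trend(ref_dict: dict) -> str:
    # Compare C->T vs G->A mismatch counts, as in extract_base_identities
    c_to_t = ref_dict.get("C", {}).get("T", 0)
    g_to_a = ref_dict.get("G", {}).get("A", 0)
    if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
        return "equal"
    elif c_to_t > g_to_a:
        return "C->T"
    elif g_to_a > c_to_t:
        return "G->A"
    return "none"

print(classify_mismatch_trend({"C": Counter({"T": 12}), "G": Counter({"A": 3})}))  # C->T
print(classify_mismatch_trend({}))  # none
```

Since the counts are integers, the `< 0.01` test is effectively an equality check, so `"equal"` is returned only when both directions saw the same nonzero count.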
smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py
RENAMED
@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     import glob
     import zipfile
 
-    os.chdir(mod_tsv_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
     if threads:
         threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     for input_file in bam_files:
         print(input_file)
         # Extract the file basename
-        file_name =
+        file_name = input_file.name
         if skip_unclassified and "unclassified" in file_name:
             print("Skipping modkit extract on unclassified reads")
         else:
             # Construct the output TSV file path
-
-
-            if
-            print(f"{
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
             else:
                 print(f"Extracting modification data from {input_file}")
                 if modkit_summary:
                     # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                 else:
                     pass
                 # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                         "--mod-thresholds", f"a:{m6A_threshold}",
                         "--mod-thresholds", f"h:{hm5C_threshold}",
                         "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                     ]
                 else:
                     extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                         "--mod-thresholds", f"m:{m5C_threshold}",
                         "--mod-thresholds", f"a:{m6A_threshold}",
                         "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                     ]
                 subprocess.run(extract_command)
                 # Zip the output TSV
                 print(f'zipping {output_tsv}')
                 if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                 else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                    subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
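The zipping path shells out to `pigz` with an optional thread count, exactly as the added lines show. A hedged sketch of that call with a plain-`gzip` fallback for hosts without `pigz` (the fallback and the helper name are assumptions, not package code):

```python
import shutil
import subprocess

def compress_tsv(output_tsv: str, threads: int | None = None) -> None:
    # Prefer parallel pigz; fall back to single-threaded gzip (assumption, not in the package)
    if shutil.which("pigz"):
        cmd = ["pigz", "-f", output_tsv] if threads is None else ["pigz", "-f", "-p", str(threads), output_tsv]
    else:
        cmd = ["gzip", "-f", output_tsv]
    subprocess.run(cmd, check=True)  # replaces output_tsv with output_tsv.gz
```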
smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py
RENAMED
@@ -2,7 +2,7 @@
 
 def extract_read_features_from_bam(bam_file_path):
     """
-    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length
+    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
     Params:
         bam_file_path (str):
     Returns:
@@ -26,6 +26,8 @@ def extract_read_features_from_bam(bam_file_path):
             reference_name = read.reference_name
             reference_index = bam_file.references.index(reference_name)
             reference_length = reference_lengths[reference_index]
-
+            mapped_length = sum(end - start for start, end in read.get_blocks())
+            mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
+            read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
 
     return read_metrics
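The added `mapped_length` line relies on pysam's `read.get_blocks()`, which returns the gapless reference-aligned segments of a read; summing their spans counts reference bases covered by alignment, excluding clips, insertions, and deletion/splice gaps. A toy illustration of the arithmetic (block coordinates invented):

```python
# Two gapless aligned blocks, e.g. separated by a deletion: [100, 250) and [300, 400)
blocks = [(100, 250), (300, 400)]
mapped_length = sum(end - start for start, end in blocks)
print(mapped_length)  # 250, versus a reference span of 300
```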
smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py
RENAMED
@@ -1,11 +1,12 @@
-def find_conversion_sites(fasta_file, modification_type,
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
     """
     Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
 
     Parameters:
         fasta_file (str): Path to the converted reference FASTA.
         modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
-
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
     Returns:
         dict: Dictionary where keys are **both unconverted & converted record names**.
@@ -14,7 +15,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     """
     import numpy as np
     from Bio import SeqIO
-    unconverted =
+    unconverted = conversions[0]
     record_dict = {}
 
     # Define base mapping based on modification type
@@ -26,7 +27,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     # Read FASTA file and process records
     with open(fasta_file, "r") as f:
         for record in SeqIO.parse(f, "fasta"):
-            if unconverted in record.id:
+            if unconverted in record.id or deaminase_footprinting:
                 sequence = str(record.seq).upper()
                 complement = str(record.seq.complement()).upper()
                 sequence_length = len(sequence)
smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py
RENAMED
@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py
RENAMED
@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path =
-    if
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir =
-    fasta_basename =
-    chrom_basename =
-    chrom_path =
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
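For readers tracking the `pathlib` migration in this hunk, the two operators compose differently: `/` appends a child path component, while `with_suffix` edits the file name itself. A quick illustration (paths invented):

```python
from pathlib import Path

fasta = Path("refs/genome.fa")
print(fasta / ".fai")                                # refs/genome.fa/.fai (child component)
print(fasta.with_suffix(fasta.suffix + ".fai"))      # refs/genome.fa.fai (suffix appended)
print(fasta.parent / (fasta.stem + ".chrom.sizes"))  # refs/genome.chrom.sizes
```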
smftools/informatics/archived/helpers/archived/index_fasta.py
ADDED
@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
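A minimal usage sketch for the new helper (the reference path and flat import are hypothetical; `pysam.faidx` writes the `.fai` next to the FASTA):

```python
from index_fasta import index_fasta  # hypothetical import; adjust to the installed package layout

chrom_sizes = index_fasta("refs/genome.fa")  # writes refs/genome.fa.fai and refs/genome.chrom.sizes
fai = index_fasta("refs/genome.fa", write_chrom_sizes=False)  # returns the .fai path instead
```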
smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py
RENAMED
@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
smftools/informatics/{helpers → archived/helpers/archived}/modQC.py
RENAMED
@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
smftools/informatics/archived/helpers/archived/plot_bed_histograms.py
ADDED
@@ -0,0 +1,250 @@
+# plot_bed_histograms
+
+def plot_bed_histograms(
+    bed_file,
+    plotting_directory,
+    fasta,
+    *,
+    bins=60,
+    clip_quantiles=(0.0, 0.995),
+    cov_bin_size=1000,            # coverage bin size in bp
+    rows_per_fig=6,               # paginate if many chromosomes
+    include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+    coordinate_mode="one_based",  # "one_based" (your BED-like) or "zero_based"
+):
+    """
+    Plot per-chromosome QC grids from a BED-like file.
+
+    Expects columns:
+        chrom, start, end, read_len, qname, mapq, avg_base_qual
+
+    For each chromosome:
+      - Column 1: Read length histogram
+      - Column 2: Coverage across the chromosome (binned)
+      - (optional) Column 3: MAPQ histogram
+      - (optional) Column 4: Avg base quality histogram
+
+    The figure is paginated: rows = chromosomes (up to rows_per_fig), columns depend on include_mapq_quality.
+    Saves one PNG per page under `plotting_directory`.
+
+    Parameters
+    ----------
+    bed_file : str
+    plotting_directory : str
+    fasta : str
+        Reference FASTA (used to get chromosome lengths).
+    bins : int
+        Histogram bins for read length / MAPQ / quality.
+    clip_quantiles : (float, float)
+        Clip hist tails for readability (e.g., (0, 0.995)).
+    cov_bin_size : int
+        Bin size (bp) for coverage plot; bigger = faster/coarser.
+    rows_per_fig : int
+        Number of chromosomes per page.
+    include_mapq_quality : bool
+        If True, add MAPQ and avg base quality histograms as extra columns.
+    coordinate_mode : {"one_based","zero_based"}
+        One-based, inclusive (your file) vs BED-standard zero-based, half-open.
+    """
+    import os
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import pysam
+
+    os.makedirs(plotting_directory, exist_ok=True)
+
+    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
+    print(f"[plot_bed_histograms] Loading: {bed_file}")
+
+    # Load BED-like table
+    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
+    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
+        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
+        'mapq': float, 'avg_q': float
+    })
+
+    # Drop unaligned records (chrom == '*') if present
+    df = df[df['chrom'] != '*'].copy()
+    if df.empty:
+        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        return
+
+    # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
+    # Input is typically one_based inclusive (from your writer).
+    if coordinate_mode not in {"one_based", "zero_based"}:
+        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
+
+    if coordinate_mode == "one_based":
+        # convert to 0-based half-open [start0, end0)
+        start0 = df['start'].to_numpy() - 1
+        end0 = df['end'].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
+    else:
+        # already 0-based half-open (assumption)
+        start0 = df['start'].to_numpy()
+        end0 = df['end'].to_numpy()
+
+    # Clip helper for hist tails
+    def _clip_series(s, q=(0.0, 0.995)):
+        if q is None:
+            return s.to_numpy()
+        lo = s.quantile(q[0]) if q[0] is not None else s.min()
+        hi = s.quantile(q[1]) if q[1] is not None else s.max()
+        x = s.to_numpy(dtype=float)
+        return np.clip(x, lo, hi)
+
+    # Load chromosome order/lengths from FASTA
+    with pysam.FastaFile(fasta) as fa:
+        ref_names = list(fa.references)
+        ref_lengths = dict(zip(ref_names, fa.lengths))
+
+    # Keep only chroms present in FASTA and with at least one read
+    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    # Order chromosomes by FASTA order
+    chrom_order = [c for c in ref_names if c in chroms]
+
+    if not chrom_order:
+        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        return
+
+    # Pagination
+    def _sanitize(name: str) -> str:
+        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
+
+    cols_per_fig = 4 if include_mapq_quality else 2
+
+    for start_idx in range(0, len(chrom_order), rows_per_fig):
+        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        nrows = len(chunk)
+        ncols = cols_per_fig
+
+        fig, axes = plt.subplots(
+            nrows=nrows, ncols=ncols,
+            figsize=(4.0 * ncols, 2.6 * nrows),
+            dpi=160,
+            squeeze=False
+        )
+
+        for r, chrom in enumerate(chunk):
+            chrom_len = ref_lengths[chrom]
+            mask = (df['chrom'].to_numpy() == chrom)
+
+            # Slice per-chrom arrays for speed
+            s0 = start0[mask]
+            e0 = end0[mask]
+            len_arr = df.loc[mask, 'read_len']
+            mapq_arr = df.loc[mask, 'mapq']
+            q_arr = df.loc[mask, 'avg_q']
+
+            # --- Col 1: Read length histogram (clipped) ---
+            ax = axes[r, 0]
+            ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            if r == 0:
+                ax.set_title("Read length")
+            ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
+            ax.set_xlabel("bp")
+            ax.grid(alpha=0.25)
+
+            # --- Col 2: Coverage (binned over genome) ---
+            ax = axes[r, 1]
+            nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
+            # Bin edges in 0-based coords
+            edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
+
+            # Compute per-bin "read count coverage": number of reads overlapping each bin.
+            # Approximate by incrementing all bins touched by the interval.
+            # (Fast and memory-light; for exact base coverage use smaller cov_bin_size.)
+            cov = np.zeros(nb, dtype=np.int32)
+            # bin indices overlapped by each read (0-based half-open)
+            b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
+            b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
+            # ensure valid ordering
+            b_lo = np.minimum(b0, b1)
+            b_hi = np.maximum(b0, b1)
+
+            # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
+            for lo, hi in zip(b_lo, b_hi):
+                cov[lo:hi + 1] += 1
+
+            x_mid = (edges[:-1] + edges[1:]) / 2.0
+            ax.plot(x_mid, cov)
+            if r == 0:
+                ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
+            ax.set_xlim(0, chrom_len)
+            ax.set_xlabel("Position (bp)")
+            ax.set_ylabel("")  # already show chrom on col 1
+            ax.grid(alpha=0.25)
+
+            if include_mapq_quality:
+                # --- Col 3: MAPQ ---
+                ax = axes[r, 2]
+                # Clip MAPQ upper tail if needed (usually 60)
+                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("MAPQ")
+                ax.set_xlabel("MAPQ")
+                ax.grid(alpha=0.25)
+
+                # --- Col 4: Avg base quality ---
+                ax = axes[r, 3]
+                ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("Avg base qual")
+                ax.set_xlabel("Phred")
+                ax.grid(alpha=0.25)
+
+        fig.suptitle(
+            f"{bed_basename} — per-chromosome QC "
+            f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
+            y=0.995, fontsize=11
+        )
+        fig.tight_layout(rect=[0, 0, 1, 0.98])
+
+        page = start_idx // rows_per_fig + 1
+        out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
+        plt.savefig(out_png, bbox_inches="tight")
+        plt.close(fig)
+
+    print("[plot_bed_histograms] Done.")
+
+
+# bed_basename = os.path.basename(bed_file).split('.bed')[0]
+# # Load the BED file into a DataFrame
+# print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
+# df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name', 'mapq', 'read_quality'])
+
+# # Group by chromosome
+# grouped = df.groupby('chromosome')
+
+# # for each chromosome, get the record length of that chromosome from the fasta. Use from 0 to this length for the positional coverage plot.
+
+# # Change below and make a plot grid instead. For each, make row for chromsome, col for read length and coverage
+# # Clip the outliers to make plots cleaner
+
+# for chrom, group in grouped:
+#     # Plot read length histogram
+#     plt.figure(figsize=(12, 6))
+#     plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
+#     plt.title(f'Read Length Histogram of reads aligned to {chrom}')
+#     plt.xlabel('Read Length')
+#     plt.ylabel('Count')
+#     plt.grid(True)
+#     save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
+#     plt.savefig(save_name)
+#     plt.close()
+
+#     # Compute coverage
+#     coverage = np.zeros(group['end'].max())
+#     for _, row in group.iterrows():
+#         coverage[row['start']:row['end']] += 1
+
+#     # Plot coverage histogram
+#     plt.figure(figsize=(12, 6))
+#     plt.plot(coverage, color='b')
+#     plt.title(f'Coverage Histogram for {chrom}')
+#     plt.xlabel('Position')
+#     plt.ylabel('Coverage')
+#     plt.grid(True)
+#     save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
+#     plt.savefig(save_name)
+#     plt.close()
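A hedged invocation sketch for the new QC plotter (paths invented; the input table must carry the seven columns named in the docstring, and the function is assumed to be imported from this module):

```python
plot_bed_histograms(
    bed_file="beds/sample1_alignments.bed",  # chrom, start, end, read_len, qname, mapq, avg_base_qual
    plotting_directory="qc_plots",           # one PNG page per rows_per_fig chromosomes
    fasta="refs/genome.fa",
    cov_bin_size=500,                        # finer coverage bins for small references
    include_mapq_quality=True,               # 4-column grid instead of 2
)
```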
smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py
RENAMED
@@ -1,6 +1,5 @@
 ## separate_bam_by_bc
 
-# General
 def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     """
     Separates an input BAM file on the BC SAM tag values.
@@ -16,24 +15,26 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base =
-    bam_base_minus_suffix =
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
        for read in bam:
            try:
                # Get the barcode tag value
-                bc_tag = read.get_tag("BC", with_value_type=True)[0]
+                bc_tag = read.get_tag("BC", with_value_type=True)[0]
+                #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                # Open the output BAM file corresponding to the barcode
                if bc_tag not in output_files:
-                    output_path =
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                # Write the read to the corresponding output BAM file
                output_files[bc_tag].write(read)
            except KeyError:
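The `[0]` index on `read.get_tag("BC", with_value_type=True)` is needed because pysam returns a `(value, type_code)` tuple in that mode. A small round-trip sketch on an in-memory read (header and tag value invented):

```python
import pysam

header = pysam.AlignmentHeader.from_dict({"HD": {"VN": "1.6"}, "SQ": [{"SN": "chr1", "LN": 1000}]})
read = pysam.AlignedSegment(header)
read.query_name = "read1"
read.set_tag("BC", "barcode01", value_type="Z")  # string-typed SAM tag

value, type_code = read.get_tag("BC", with_value_type=True)
print(value, type_code)  # barcode01 Z
```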
smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py
RENAMED
@@ -1,36 +1,32 @@
 ## split_and_index_BAM
 
-def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix
+def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
     """
     A wrapper function for splitting BAMS and indexing them.
     Parameters:
         aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         bam_suffix (str): A suffix to add to the bam file.
-        output_directory (str): A file path to the directory to output all the analyses.
 
     Returns:
         None
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
-    plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
-    bed_dir = os.path.join(output_directory, 'demultiplexed_read_alignment_coordinates')
-    make_dirs([plotting_dir, bed_dir])
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix =
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-
+        pysam.index(input_file)
 
     return bam_files
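The loop body now indexes each split BAM with `pysam.index`, which wraps `samtools index` and expects coordinate-sorted input. A one-line sketch (path invented):

```python
import pysam

pysam.index("split/20240101_sample_barcode01.bam")  # writes the .bam.bai alongside the BAM
```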