smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0

smftools/informatics/helpers/discover_input_files.py
@@ -0,0 +1,100 @@
+from pathlib import Path
+from typing import Dict, List, Any, Tuple
+
+def discover_input_files(
+    input_data_path: str,
+    bam_suffix: str = ".bam",
+    recursive: bool = False,
+    follow_symlinks: bool = False,
+) -> Dict[str, Any]:
+    """
+    Discover input files under `input_data_path`.
+
+    Returns a dict with:
+    - pod5_paths, fast5_paths, fastq_paths, bam_paths (lists of str)
+    - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam (bools)
+    - all_files_searched (int)
+    Behavior:
+    - If `input_data_path` is a file, returns that single file categorized.
+    - If it is a directory, scans either immediate children (recursive=False)
+      or entire tree (recursive=True). Uses Path.suffixes to detect .fastq.gz etc.
+    """
+    p = Path(input_data_path)
+    pod5_exts = {".pod5", ".p5"}
+    fast5_exts = {".fast5", ".f5"}
+    fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.xz", ".fq.xz"}
+    # normalize bam suffix with leading dot
+    if not bam_suffix.startswith("."):
+        bam_suffix = "." + bam_suffix
+    bam_suffix = bam_suffix.lower()
+
+    pod5_paths: List[str] = []
+    fast5_paths: List[str] = []
+    fastq_paths: List[str] = []
+    bam_paths: List[str] = []
+    other_paths: List[str] = []
+
+    def _file_ext_key(pp: Path) -> str:
+        # join suffixes to handle .fastq.gz
+        return "".join(pp.suffixes).lower() if pp.suffixes else pp.suffix.lower()
+
+    if p.exists() and p.is_file():
+        ext_key = _file_ext_key(p)
+        if ext_key in pod5_exts:
+            pod5_paths.append(str(p))
+        elif ext_key in fast5_exts:
+            fast5_paths.append(str(p))
+        elif ext_key in fastq_exts:
+            fastq_paths.append(str(p))
+        elif ext_key == bam_suffix:
+            bam_paths.append(str(p))
+        else:
+            other_paths.append(str(p))
+        total_searched = 1
+    elif p.exists() and p.is_dir():
+        if recursive:
+            iterator = p.rglob("*")
+        else:
+            iterator = p.iterdir()
+        total_searched = 0
+        for fp in iterator:
+            if not fp.is_file():
+                continue
+            total_searched += 1
+            ext_key = _file_ext_key(fp)
+            if ext_key in pod5_exts:
+                pod5_paths.append(str(fp))
+            elif ext_key in fast5_exts:
+                fast5_paths.append(str(fp))
+            elif ext_key in fastq_exts:
+                fastq_paths.append(str(fp))
+            elif ext_key == bam_suffix:
+                bam_paths.append(str(fp))
+            else:
+                # additional heuristic: check filename contains extension fragments (.pod5 etc)
+                name = fp.name.lower()
+                if any(e in name for e in pod5_exts):
+                    pod5_paths.append(str(fp))
+                elif any(e in name for e in fast5_exts):
+                    fast5_paths.append(str(fp))
+                elif any(e in name for e in [".fastq", ".fq"]):
+                    fastq_paths.append(str(fp))
+                elif name.endswith(bam_suffix):
+                    bam_paths.append(str(fp))
+                else:
+                    other_paths.append(str(fp))
+    else:
+        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")
+
+    return {
+        "pod5_paths": sorted(pod5_paths),
+        "fast5_paths": sorted(fast5_paths),
+        "fastq_paths": sorted(fastq_paths),
+        "bam_paths": sorted(bam_paths),
+        "other_paths": sorted(other_paths),
+        "input_is_pod5": len(pod5_paths) > 0,
+        "input_is_fast5": len(fast5_paths) > 0,
+        "input_is_fastq": len(fastq_paths) > 0,
+        "input_is_bam": len(bam_paths) > 0,
+        "all_files_searched": total_searched,
+    }
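
A minimal usage sketch for `discover_input_files`; the run directory is illustrative, not part of the package:

```python
from smftools.informatics.helpers.discover_input_files import discover_input_files

# Categorize everything under a (hypothetical) run directory, including subfolders.
found = discover_input_files("/data/run01", bam_suffix=".bam", recursive=True)
print(f"{found['all_files_searched']} files searched")
if found["input_is_pod5"]:
    print("POD5 inputs:", found["pod5_paths"])
```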
smftools/informatics/helpers/extract_base_identities.py
@@ -0,0 +1,70 @@
+def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
+    """
+    Efficiently extracts base identities from mapped reads at reference coordinates.
+
+    Parameters:
+        bam_file (str): Path to the BAM file.
+        chromosome (str): Name of the reference chromosome.
+        positions (list): Positions to extract (0-based).
+        max_reference_length (int): Maximum reference length for padding.
+        sequence (str): The sequence of the reference FASTA record.
+
+    Returns:
+        dict: Base identities from forward mapped reads.
+        dict: Base identities from reverse mapped reads.
+        dict: Per-read mismatch counts, keyed by reference base then read base.
+        dict: Per-read dominant mismatch trend ('C->T', 'G->A', 'equal', or 'none').
+    """
+    import pysam
+    import numpy as np
+    from collections import defaultdict, Counter
+
+    positions = set(positions)
+    fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+    rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+    mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
+
+    with pysam.AlignmentFile(bam_file, "rb") as bam:
+        ref_seq = sequence.upper()
+        for read in bam.fetch(chromosome):
+            if not read.is_mapped:
+                continue  # Skip unmapped reads
+
+            read_name = read.query_name
+            query_sequence = read.query_sequence
+            base_dict = rev_base_identities if read.is_reverse else fwd_base_identities
+
+            # Use get_aligned_pairs directly with positions filtering
+            aligned_pairs = read.get_aligned_pairs(matches_only=True)
+
+            for read_position, reference_position in aligned_pairs:
+                if reference_position in positions:
+                    read_base = query_sequence[read_position]
+                    ref_base = ref_seq[reference_position]
+
+                    base_dict[read_name][reference_position] = read_base
+
+                    # Track mismatches (excluding Ns)
+                    if read_base != ref_base and read_base != 'N' and ref_base != 'N':
+                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+
+    # Determine C->T vs G->A dominance per read
+    mismatch_trend_per_read = {}
+    for read_name, ref_dict in mismatch_counts_per_read.items():
+        c_to_t = ref_dict.get("C", {}).get("T", 0)
+        g_to_a = ref_dict.get("G", {}).get("A", 0)
+
+        # Counts are integers, so "equal" means identical nonzero counts
+        if c_to_t == g_to_a and c_to_t > 0:
+            mismatch_trend_per_read[read_name] = "equal"
+        elif c_to_t > g_to_a:
+            mismatch_trend_per_read[read_name] = "C->T"
+        elif g_to_a > c_to_t:
+            mismatch_trend_per_read[read_name] = "G->A"
+        else:
+            mismatch_trend_per_read[read_name] = "none"
+
+    return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
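
A sketch of how the four return values might be consumed; the BAM path, record name, and reference string below are placeholders:

```python
from smftools.informatics.helpers.extract_base_identities import extract_base_identities

reference_sequence = "ACGT" * 250  # placeholder; use the real record sequence
fwd, rev, mismatch_counts, mismatch_trends = extract_base_identities(
    bam_file="aligned_sorted.bam",   # hypothetical path
    chromosome="chr1_5mC_top",       # hypothetical record name
    positions=range(len(reference_sequence)),
    max_reference_length=len(reference_sequence),
    sequence=reference_sequence,
)
deaminated_like = [r for r, t in mismatch_trends.items() if t == "C->T"]
print(len(fwd), "forward reads;", len(rev), "reverse reads;", len(deaminated_like), "C->T-dominant reads")
```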
smftools/informatics/helpers/extract_mods.py
@@ -0,0 +1,83 @@
+## extract_mods
+
+def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassified=True, modkit_summary=False, threads=None):
+    """
+    Takes all of the aligned, sorted, split modified BAM files and runs Nanopore modkit extract to load the modification data into zipped TSV files.
+
+    Parameters:
+        thresholds (list): A list of thresholds used to mark each basecalled base as passing or failing on canonical and modification call status.
+        mod_tsv_dir (str): Path to the directory to hold the modkit extract outputs.
+        split_dir (str): Path to the directory containing the converted aligned_sorted_split BAM files.
+        bam_suffix (str): The suffix to use for the BAM files.
+        skip_unclassified (bool): Whether to skip the unclassified BAM file when running modkit extract.
+        modkit_summary (bool): Whether to run and display modkit summary.
+        threads (int): Number of threads to use.
+
+    Returns:
+        None
+        Runs modkit extract on input aligned_sorted_split modified BAM files to output zipped TSVs containing modification calls.
+    """
+    import os
+    import subprocess
+    import glob
+
+    os.chdir(mod_tsv_dir)
+    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+    bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
+
+    if threads:
+        threads = str(threads)
+
+    for input_file in bam_files:
+        print(input_file)
+        # Extract the file basename
+        file_name = os.path.basename(input_file)
+        if skip_unclassified and "unclassified" in file_name:
+            print("Skipping modkit extract on unclassified reads")
+            continue
+
+        # Construct the output TSV file path
+        output_tsv = os.path.join(mod_tsv_dir, file_name).replace(bam_suffix, "") + "_extract.tsv"
+        if os.path.exists(f"{output_tsv}.gz"):
+            print(f"{output_tsv}.gz already exists, skipping modkit extract")
+            continue
+
+        print(f"Extracting modification data from {input_file}")
+        if modkit_summary:
+            # Run modkit summary
+            subprocess.run(["modkit", "summary", input_file])
+
+        # Run modkit extract, threading the command only when requested
+        extract_command = [
+            "modkit", "extract",
+            "calls", "--mapped-only",
+            "--filter-threshold", f"{filter_threshold}",
+            "--mod-thresholds", f"m:{m5C_threshold}",
+            "--mod-thresholds", f"a:{m6A_threshold}",
+            "--mod-thresholds", f"h:{hm5C_threshold}",
+        ]
+        if threads:
+            extract_command += ["-t", threads]
+        extract_command += [input_file, output_tsv]
+        subprocess.run(extract_command)
+
+        # Zip the output TSV
+        print(f'zipping {output_tsv}')
+        if threads:
+            zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+        else:
+            zip_command = ["pigz", "-f", output_tsv]
+        subprocess.run(zip_command, check=True)
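
An illustrative call, assuming `modkit` and `pigz` are on PATH; the threshold values and directory names are placeholders:

```python
from smftools.informatics.helpers.extract_mods import extract_mods

extract_mods(
    thresholds=[0.7, 0.8, 0.8, 0.8],  # [filter, m6A, m5C, hm5C], probabilities in [0, 1]
    mod_tsv_dir="/data/run01/mod_tsvs",
    split_dir="/data/run01/aligned_sorted_split",
    bam_suffix=".bam",
    skip_unclassified=True,
    threads=8,
)
```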
smftools/informatics/helpers/extract_read_features_from_bam.py
@@ -0,0 +1,33 @@
+# extract_read_features_from_bam
+
+def extract_read_features_from_bam(bam_file_path):
+    """
+    Make a dict of reads from a BAM that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality.
+
+    Params:
+        bam_file_path (str): Path to the BAM file.
+    Returns:
+        read_metrics (dict)
+    """
+    import pysam
+    import numpy as np
+    # Open the BAM file
+    print(f'Extracting read features from BAM: {bam_file_path}')
+    with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
+        read_metrics = {}
+        for read in bam_file:
+            # Skip unmapped reads
+            if read.is_unmapped:
+                continue
+            # Extract the read metrics
+            read_quality = read.query_qualities
+            median_read_quality = np.median(read_quality)
+            # Extract the length of the reference (chromosome) the read mapped to
+            reference_length = bam_file.get_reference_length(read.reference_name)
+            mapped_length = sum(end - start for start, end in read.get_blocks())
+            mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
+            read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
+
+    return read_metrics
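
The per-read metric lists load neatly into a DataFrame; a sketch with a hypothetical BAM path:

```python
import pandas as pd
from smftools.informatics.helpers.extract_read_features_from_bam import extract_read_features_from_bam

metrics = extract_read_features_from_bam("aligned_sorted.bam")  # hypothetical path
df = pd.DataFrame.from_dict(
    metrics, orient="index",
    columns=["read_length", "median_q", "reference_length", "mapped_length", "mapq"],
)
print(df.describe())
```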
smftools/informatics/helpers/extract_read_lengths_from_bed.py
@@ -0,0 +1,25 @@
+# extract_read_lengths_from_bed
+
+def extract_read_lengths_from_bed(file_path):
+    """
+    Load a dict of read names that points to the read length.
+
+    Params:
+        file_path (str): file path to a BED file with columns: chrom, start, end, length, name
+    Returns:
+        read_dict (dict)
+    """
+    import pandas as pd
+    columns = ['chrom', 'start', 'end', 'length', 'name']
+    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+    # Map each read name to its length; the positional columns are not needed here
+    read_dict = dict(zip(df['name'], df['length']))
+
+    return read_dict
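
The expected input is a five-column, tab-separated BED (chrom, start, end, length, name); a sketch with a placeholder file:

```python
from smftools.informatics.helpers.extract_read_lengths_from_bed import extract_read_lengths_from_bed

read_lengths = extract_read_lengths_from_bed("reads.bed")  # hypothetical path
print(max(read_lengths.values()), "bp longest read")
```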
smftools/informatics/helpers/extract_readnames_from_BAM.py
@@ -0,0 +1,22 @@
+# extract_readnames_from_BAM
+
+def extract_readnames_from_BAM(aligned_BAM):
+    """
+    Takes a BAM and writes out a txt file containing read names from the BAM.
+
+    Parameters:
+        aligned_BAM (str): Path to an input aligned BAM to extract read names from.
+
+    Returns:
+        None
+    """
+    import subprocess
+    # Make a text file of reads for the BAM
+    txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
+    samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
+    with open(txt_output, "w") as output_file:
+        cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
+        samtools_view.stdout.close()
+        cut_process.wait()
+    samtools_view.wait()
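
A behavior-equivalent pure-pysam sketch, for contexts where spawning `samtools`/`cut` subprocesses is undesirable; this alternative is not part of the package:

```python
import pysam

def extract_readnames_pysam(aligned_bam: str) -> None:
    # Same output file naming convention as extract_readnames_from_BAM
    txt_output = aligned_bam.split(".bam")[0] + "_read_names.txt"
    with pysam.AlignmentFile(aligned_bam, "rb") as bam, open(txt_output, "w") as out:
        # until_eof=True streams all records without requiring a BAM index
        for read in bam.fetch(until_eof=True):
            out.write(read.query_name + "\n")
```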
smftools/informatics/helpers/find_conversion_sites.py
@@ -0,0 +1,51 @@
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
+    """
+    Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
+
+    Parameters:
+        fasta_file (str): Path to the converted reference FASTA.
+        modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+
+    Returns:
+        dict: Dictionary where keys are **both unconverted & converted record names**.
+            Values contain:
+            [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
+    """
+    import numpy as np
+    from Bio import SeqIO
+    unconverted = conversions[0]
+    record_dict = {}
+
+    # Define base mapping based on modification type
+    base_mappings = {
+        '5mC': ('C', 'G'),  # Cytosine and Guanine
+        '6mA': ('A', 'T')   # Adenine and Thymine
+    }
+
+    # Read FASTA file and process records
+    with open(fasta_file, "r") as f:
+        for record in SeqIO.parse(f, "fasta"):
+            if unconverted in record.id or deaminase_footprinting:
+                sequence = str(record.seq).upper()
+                complement = str(record.seq.complement()).upper()
+                sequence_length = len(sequence)
+
+                # Unconverted case: store the full sequence without coordinate filtering
+                if modification_type == unconverted:
+                    record_dict[record.id] = [sequence_length, [], [], sequence, complement]
+
+                # Process converted records: extract modified base positions
+                elif modification_type in base_mappings:
+                    top_base, bottom_base = base_mappings[modification_type]
+                    seq_array = np.array(list(sequence))
+                    top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
+                    bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
+
+                    record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
+
+                else:
+                    raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
+
+    return record_dict
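
An illustrative call; the FASTA path and conversion labels are placeholders (the first element of `conversions` names the unconverted record type):

```python
from smftools.informatics.helpers.find_conversion_sites import find_conversion_sites

sites = find_conversion_sites(
    fasta_file="reference_converted.fasta",  # hypothetical path
    modification_type="5mC",
    conversions=["unconverted", "5mC"],      # illustrative labels
)
for record_id, (length, top, bottom, seq, comp) in sites.items():
    print(record_id, f"{length} bp, {len(top)} C sites (top), {len(bottom)} G sites (bottom)")
```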
smftools/informatics/helpers/generate_converted_FASTA.py
@@ -0,0 +1,99 @@
+import gzip
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from concurrent.futures import ProcessPoolExecutor
+
+def convert_FASTA_record(record, modification_type, strand, unconverted):
+    """ Converts a FASTA record based on modification type and strand. """
+    conversion_maps = {
+        ('5mC', 'top'): ('C', 'T'),
+        ('5mC', 'bottom'): ('G', 'A'),
+        ('6mA', 'top'): ('A', 'G'),
+        ('6mA', 'bottom'): ('T', 'C')
+    }
+
+    sequence = str(record.seq).upper()
+
+    if modification_type == unconverted:
+        return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)
+
+    if (modification_type, strand) not in conversion_maps:
+        raise ValueError(f"Invalid combination: {modification_type}, {strand}")
+
+    original_base, converted_base = conversion_maps[(modification_type, strand)]
+    new_seq = sequence.replace(original_base, converted_base)
+
+    return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)
+
+
+def process_fasta_record(args):
+    """
+    Processes a single FASTA record for parallel execution.
+    Args:
+        args (tuple): (record, modification_types, strands, unconverted)
+    Returns:
+        list of modified SeqRecord objects.
+    """
+    record, modification_types, strands, unconverted = args
+    modified_records = []
+
+    for modification_type in modification_types:
+        for i, strand in enumerate(strands):
+            if i > 0 and modification_type == unconverted:
+                continue  # Ensure unconverted is added only once
+
+            modified_records.append(convert_FASTA_record(record, modification_type, strand, unconverted))
+
+    return modified_records
+
+
+def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
+    """
+    Converts an input FASTA file and writes a new converted FASTA file efficiently.
+
+    Parameters:
+        input_fasta (str): Path to the unconverted FASTA file.
+        modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
+        strands (list): List of strands ('top', 'bottom').
+        output_fasta (str): Path to the converted FASTA output file.
+        num_threads (int): Number of parallel worker processes to use.
+        chunk_size (int): Number of records to process per write batch.
+
+    Returns:
+        None (Writes the converted FASTA file).
+    """
+    unconverted = modification_types[0]
+
+    # Detect if input is gzipped
+    open_func = gzip.open if input_fasta.endswith('.gz') else open
+    file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
+
+    def fasta_record_generator():
+        """ Lazily yields FASTA records from file. """
+        with open_func(input_fasta, file_mode) as handle:
+            for record in SeqIO.parse(handle, 'fasta'):
+                yield record
+
+    with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
+        # Process records in parallel using a named function (avoiding lambda)
+        results = executor.map(
+            process_fasta_record,
+            ((record, modification_types, strands, unconverted) for record in fasta_record_generator())
+        )
+
+        buffer = []
+        for modified_records in results:
+            buffer.extend(modified_records)
+
+            # Write out in chunks to save memory
+            if len(buffer) >= chunk_size:
+                SeqIO.write(buffer, output_handle, 'fasta')
+                buffer.clear()
+
+        # Write any remaining records
+        if buffer:
+            SeqIO.write(buffer, output_handle, 'fasta')
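
A sketch producing a converted reference with both strands for 5mC plus a single unconverted record per input record; paths are illustrative:

```python
from smftools.informatics.helpers.generate_converted_FASTA import generate_converted_FASTA

generate_converted_FASTA(
    input_fasta="reference.fasta",              # hypothetical; .gz inputs also work
    modification_types=["unconverted", "5mC"],  # first entry is the unconverted label
    strands=["top", "bottom"],
    output_fasta="reference_converted.fasta",
    num_threads=4,
)
```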
smftools/informatics/helpers/get_chromosome_lengths.py
@@ -0,0 +1,32 @@
+# get_chromosome_lengths
+
+def get_chromosome_lengths(fasta):
+    """
+    Generates a file containing chromosome lengths within an input FASTA.
+
+    Parameters:
+        fasta (str): Path to the input fasta
+    """
+    import os
+    import subprocess
+    from .index_fasta import index_fasta
+
+    # Make a fasta index file if one isn't already available
+    index_path = f'{fasta}.fai'
+    if os.path.exists(index_path):
+        print(f'Using existing fasta index file: {index_path}')
+    else:
+        index_fasta(fasta)
+
+    parent_dir = os.path.dirname(fasta)
+    fasta_basename = os.path.basename(fasta)
+    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
+    chrom_path = os.path.join(parent_dir, chrom_basename)
+
+    # Make a chromosome length file
+    if os.path.exists(chrom_path):
+        print(f'Using existing chrom length index file: {chrom_path}')
+    else:
+        with open(chrom_path, 'w') as outfile:
+            command = ["cut", "-f1,2", index_path]
+            subprocess.run(command, stdout=outfile)
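
Given `reference.fasta` (an illustrative path), this writes `reference.fasta.fai` via `samtools faidx` if missing, then `reference.chrom.sizes` next to the FASTA:

```python
from smftools.informatics.helpers.get_chromosome_lengths import get_chromosome_lengths

get_chromosome_lengths("reference.fasta")  # requires samtools on PATH
```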
smftools/informatics/helpers/get_native_references.py
@@ -0,0 +1,28 @@
+## get_native_references
+
+# Direct methylation specific
+def get_native_references(fasta_file):
+    """
+    Makes a dictionary keyed by record id which points to the record length and record sequence.
+
+    Parameters:
+        fasta_file (str): A string representing the path to the FASTA file for the experiment.
+
+    Returns:
+        record_dict (dict): Maps record id to [sequence length, sequence].
+    """
+    from .. import readwrite
+    from Bio import SeqIO
+    record_dict = {}
+    print('{0}: Opening FASTA file {1}'.format(readwrite.time_string(), fasta_file))
+    # Open the FASTA record as read only
+    with open(fasta_file, "r") as f:
+        # Iterate over records in the FASTA
+        for record in SeqIO.parse(f, "fasta"):
+            # Extract the sequence string of the record
+            sequence = str(record.seq).upper()
+            sequence_length = len(sequence)
+            record_dict[record.id] = [sequence_length, sequence]
+    return record_dict
smftools/informatics/helpers/index_fasta.py
@@ -0,0 +1,12 @@
+# index_fasta
+
+def index_fasta(fasta):
+    """
+    Generate a FASTA index file for an input fasta.
+
+    Parameters:
+        fasta (str): Path to the input fasta to make an index file for.
+    """
+    import subprocess
+
+    subprocess.run(["samtools", "faidx", fasta])
smftools/informatics/helpers/make_dirs.py
@@ -0,0 +1,21 @@
+## make_dirs
+
+# General
+def make_dirs(directories):
+    """
+    Takes a list of directory paths and creates each directory if it does not already exist.
+
+    Parameters:
+        directories (list): A list of directories to make
+
+    Returns:
+        None
+    """
+    import os
+
+    for directory in directories:
+        # Note: os.mkdir creates one level only; the parent directory must already exist.
+        if not os.path.isdir(directory):
+            os.mkdir(directory)
+            print(f"Directory '{directory}' created successfully.")
+        else:
+            print(f"Directory '{directory}' already exists.")
smftools/informatics/helpers/make_modbed.py
@@ -0,0 +1,27 @@
+## make_modbed
+
+# Direct SMF
+def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
+    """
+    Generates position methylation summaries for each barcoded sample, starting from the overall BAM file that was the direct output of the dorado aligner.
+
+    Parameters:
+        aligned_sorted_output (str): File path to the aligned_sorted non-split BAM file.
+        thresholds (list): [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold] call thresholds.
+        mod_bed_dir (str): Directory to write the modkit pileup outputs into.
+
+    Returns:
+        None
+    """
+    import os
+    import subprocess
+
+    os.chdir(mod_bed_dir)
+    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+    command = [
+        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "--partition-tag", "BC",
+        "--only-tabs",
+        "--filter-threshold", f'{filter_threshold}',
+        "--mod-thresholds", f"m:{m5C_threshold}",
+        "--mod-thresholds", f"a:{m6A_threshold}",
+        "--mod-thresholds", f"h:{hm5C_threshold}"
+    ]
+    subprocess.run(command)
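
An illustrative call; the BAM path, threshold values, and output directory are placeholders. The four thresholds unpack as [filter, m6A, m5C, hm5C]:

```python
from smftools.informatics.helpers.make_modbed import make_modbed

make_modbed(
    "aligned_sorted.bam",     # hypothetical path
    [0.7, 0.8, 0.8, 0.8],     # [filter, m6A, m5C, hm5C]
    "/data/run01/mod_beds",   # hypothetical output directory
)
```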
smftools/informatics/helpers/modQC.py
@@ -0,0 +1,27 @@
+## modQC
+
+# Direct SMF
+def modQC(aligned_sorted_output, thresholds):
+    """
+    Outputs the percentile of bases falling at each call threshold (a probability between 0 and 1) for the overall BAM file.
+    It is generally good to look at these parameters on positive and negative controls.
+
+    Parameters:
+        aligned_sorted_output (str): File path of the aligned_sorted non-split BAM file output by the dorado aligner.
+        thresholds (list): A list of floats to pass as call thresholds.
+
+    Returns:
+        None
+    """
+    import subprocess
+
+    filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
+    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    command = [
+        "modkit", "summary", aligned_sorted_output,
+        "--filter-threshold", f"{filter_threshold}",
+        "--mod-thresholds", f"m:{m5C_threshold}",
+        "--mod-thresholds", f"a:{m6A_threshold}",
+        "--mod-thresholds", f"h:{hm5C_threshold}"
+    ]
+    subprocess.run(command)
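
A companion sketch: run `modQC` on the same BAM before committing to thresholds for `make_modbed` or `extract_mods` (path and values are placeholders):

```python
from smftools.informatics.helpers.modQC import modQC

modQC("aligned_sorted.bam", [0.7, 0.8, 0.8, 0.8])  # hypothetical path and thresholds
```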