babappa 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- babappa-0.1.0/PKG-INFO +10 -0
- babappa-0.1.0/README.md +0 -0
- babappa-0.1.0/babappa/__init__.py +0 -0
- babappa-0.1.0/babappa/cli.py +28 -0
- babappa-0.1.0/babappa/workflow/scripts/ancestral_reconstruction.py +162 -0
- babappa-0.1.0/babappa/workflow/scripts/codon_align.py +104 -0
- babappa-0.1.0/babappa/workflow/scripts/dynamic_codeml.py +196 -0
- babappa-0.1.0/babappa/workflow/scripts/orthogroup.py +263 -0
- babappa-0.1.0/babappa/workflow/scripts/parse_absrel.py +68 -0
- babappa-0.1.0/babappa.egg-info/PKG-INFO +10 -0
- babappa-0.1.0/babappa.egg-info/SOURCES.txt +15 -0
- babappa-0.1.0/babappa.egg-info/dependency_links.txt +1 -0
- babappa-0.1.0/babappa.egg-info/entry_points.txt +2 -0
- babappa-0.1.0/babappa.egg-info/requires.txt +2 -0
- babappa-0.1.0/babappa.egg-info/top_level.txt +1 -0
- babappa-0.1.0/pyproject.toml +21 -0
- babappa-0.1.0/setup.cfg +4 -0
babappa-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: babappa
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evolutionary selection pipeline integrating RBH, codon alignment, HyPhy, PAML, and ancestral reconstruction.
|
|
5
|
+
Author-email: Krishnendu Sinha <dr.krishnendusinha@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: biopython
|
|
10
|
+
Requires-Dist: snakemake
|
babappa-0.1.0/README.md
ADDED
|
File without changes
|
|
babappa-0.1.0/babappa/cli.py ADDED
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import argparse
import subprocess
from pathlib import Path

def main():
    """CLI entry point: parse arguments and launch the bundled Snakemake workflow."""
    arg_parser = argparse.ArgumentParser(prog="babappa")
    arg_parser.add_argument("--query", required=True)
    arg_parser.add_argument("--proteomes", required=True)
    arg_parser.add_argument("--cds", required=True)
    arg_parser.add_argument("--threads", type=int, default=4)
    opts = arg_parser.parse_args()

    # The Snakefile ships inside the package, next to this module.
    snakefile = Path(__file__).parent / "workflow" / "Snakefile"

    config_pairs = [
        f"query={opts.query}",
        f"proteomes={opts.proteomes}",
        f"cds={opts.cds}",
    ]

    invocation = [
        "snakemake",
        "-s", str(snakefile),
        "--cores", str(opts.threads),
        "--config",
        *config_pairs,
    ]

    # check=True: propagate a non-zero workflow exit as CalledProcessError.
    subprocess.run(invocation, check=True)
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""Reconstruct ancestral sequences (via PAML/codeml) for branches flagged as
under significant selection by the dynamic_codeml step.

Run by Snakemake's ``script:`` directive; the ``snakemake`` object is injected
at runtime, so this module is not importable on its own.
"""

import subprocess
from pathlib import Path
import shutil
import re
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

############################################################
# Snakemake inputs
############################################################

aln = Path(snakemake.input.aln).resolve()
tree = Path(snakemake.input.tree).resolve()
summary = Path(snakemake.input.summary).resolve()

# Single FASTA of reconstructed ancestor proteins.
output_fasta = Path(snakemake.output[0]).resolve()

base_dir = output_fasta.parent
base_dir.mkdir(parents=True, exist_ok=True)

############################################################
# Read significant branches
############################################################

significant_branches = []

with open(summary) as f:
    next(f)  # skip the TSV header row
    for line in f:
        fields = line.strip().split("\t")
        branch = fields[0]
        # Column 5 is the "Significant" YES/NO column written by
        # dynamic_codeml's summary (Branch, lnL_null, lnL_alt, LRT,
        # pvalue, Significant, BEB_sites).
        significant = fields[5]
        if significant == "YES":
            significant_branches.append(branch)

if not significant_branches:
    # Nothing to reconstruct: still create the (empty) output so the
    # Snakemake target is satisfied.
    print("[INFO] No significant branches. No ASR performed.")
    output_fasta.touch()
    exit()

print(f"[INFO] Reconstructing ancestors for {len(significant_branches)} branches")

############################################################
# Utility
############################################################
def write_ctl(path, seqfile, treefile):
    """Write a minimal codeml control file for ancestral reconstruction.

    RateAncestor = 1 asks codeml to emit the ``rst`` file containing
    reconstructed ancestral sequences; cleandata = 1 drops alignment
    columns with gaps/ambiguity.
    """
    option_lines = [
        "",
        f"seqfile = {seqfile}",
        f"treefile = {treefile}",
        "outfile = mlc",
        "",
        "noisy = 0",
        "verbose = 0",
        "runmode = 0",
        "",
        "seqtype = 1",
        "CodonFreq = 2",
        "",
        "model = 0",
        "NSsites = 0",
        "",
        "RateAncestor = 1",
        "cleandata = 1",
    ]
    path.write_text("\n".join(option_lines) + "\n")
|
|
71
|
+
|
|
72
|
+
def extract_ancestral_from_rst(rst_file):
    """Parse codeml's ``rst`` output; return ``{name: ungapped sequence}``.

    Reads the "List of extant and reconstructed sequences" section. Each
    record line is either an extant taxon::

        taxon_name     ATG GCA ...

    or a reconstructed internal node::

        node #8        ATG GCA ...

    codeml prints the sequence with a space between codons, so everything
    after the name is joined with whitespace removed.

    Bug fixes vs. the previous implementation:
    * the section header is followed by a blank line (and a seq/site count
      line) before the first record, so parsing must not stop at the first
      blank line — it stops at the first blank line *after* records;
    * taking only the second whitespace token captured a single codon (or,
      for "node #8" lines, the literal "#8") instead of the sequence.
    """
    sequences = {}
    capture = False

    with open(rst_file) as f:
        for line in f:
            if "List of extant and reconstructed sequences" in line:
                capture = True
                continue

            if not capture:
                continue

            stripped = line.strip()

            if stripped == "":
                if sequences:
                    break  # blank line after the records ends the section
                continue   # blank preamble before the first record

            parts = stripped.split()

            # Skip the "<n_seqs> <n_sites>" count line codeml prints first.
            if all(tok.isdigit() for tok in parts):
                continue

            if parts[0] == "node" and len(parts) >= 2 and parts[1].startswith("#"):
                name = f"{parts[0]} {parts[1]}"  # e.g. "node #8"
                seq_tokens = parts[2:]
            else:
                name = parts[0]
                seq_tokens = parts[1:]

            if seq_tokens:
                sequences[name] = "".join(seq_tokens)

    return sequences
|
+
|
|
94
|
+
|
|
95
|
+
############################################################
|
|
96
|
+
# Perform ASR once (not per branch)
############################################################

# codeml reconstructs every internal node in a single run, so ASR is executed
# once here and per-branch ancestors are looked up from the result afterwards.
workdir = base_dir / "ancestral_tmp"
workdir.mkdir(exist_ok=True)

shutil.copy(aln, workdir / "aln.phy")
shutil.copy(tree, workdir / "tree.nwk")

ctl_file = workdir / "asr.ctl"
write_ctl(ctl_file, "aln.phy", "tree.nwk")

print("[INFO] Running codeml ancestral reconstruction")
subprocess.run(["codeml", str(ctl_file)], cwd=workdir, check=True)

# codeml writes reconstructed sequences to a file literally named "rst".
rst_file = workdir / "rst"

if not rst_file.exists():
    raise RuntimeError("rst file not generated")

ancestral_sequences = extract_ancestral_from_rst(rst_file)

############################################################
# Map branches to ancestor node
############################################################

# Codeml internal nodes are numeric labels (e.g., 8, 9, 10...)
# We extract the parent node for each branch from the tree

with open(tree) as f:
    newick = f.read()

records = []

for branch in significant_branches:

    # Find pattern like: (ancestor,branch)
    # NOTE(review): this regex captures the branch's *sibling* label in a
    # cherry (and only when the sibling is written first), not the
    # codeml-numbered parent node — confirm this mapping is intended.
    pattern = r"\(([^,()]+)," + re.escape(branch) + r"\)"
    match = re.search(pattern, newick)

    if not match:
        continue

    ancestor_node = match.group(1)

    if ancestor_node in ancestral_sequences:
        seq = ancestral_sequences[ancestor_node]
        # Translate the reconstructed coding sequence to protein.
        protein = Seq(seq).translate()

        records.append(
            SeqRecord(
                protein,
                id=f"Ancestor_of_{branch}",
                description=""
            )
        )

############################################################
# Write FASTA
############################################################

# Always produce the output file so downstream rules are satisfied, even
# when no ancestors could be mapped.
if records:
    SeqIO.write(records, output_fasta, "fasta")
else:
    output_fasta.touch()

print("\n[INFO] Ancestral reconstruction completed.\n")
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import shutil
|
|
6
|
+
import glob
|
|
7
|
+
|
|
8
|
+
############################################################
|
|
9
|
+
# Utility
|
|
10
|
+
############################################################
|
|
11
|
+
|
|
12
|
+
def run_cmd(cmd, cwd=None):
    """Echo an external command, then execute it.

    Raises subprocess.CalledProcessError if the command exits non-zero.
    """
    banner = " ".join(cmd)
    print("[RUN]", banner)
    subprocess.run(cmd, check=True, cwd=cwd)
|
|
16
|
+
|
|
17
|
+
############################################################
|
|
18
|
+
# Snakemake inputs
############################################################

# `snakemake` is injected by the Snakemake `script:` directive.
cds_input = Path(snakemake.input.cds).resolve()

protein_aln_output = Path(snakemake.output.protein_aln).resolve()
cds_aln_output = Path(snakemake.output.cds_aln).resolve()

threads = snakemake.threads

alignment_dir = protein_aln_output.parent.resolve()
alignment_dir.mkdir(parents=True, exist_ok=True)

############################################################
# Copy input CDS into alignment directory
############################################################

# Work on a local copy so babappalign's outputs land in alignment_dir.
local_cds = alignment_dir / cds_input.name
shutil.copy(str(cds_input), str(local_cds))

############################################################
# Run babappalign in alignment directory
############################################################

print("\n=== RUNNING BABAPPALIGN (CODON MODE) ===")

run_cmd([
    "babappalign",
    local_cds.name,
    "--mode", "codon",
    "--model", "babappascore"
], cwd=alignment_dir)

############################################################
# Detect generated alignment files
############################################################

print("\n=== DETECTING BABAPPALIGN OUTPUT FILES ===")

fasta_files = list(alignment_dir.glob("*.fasta"))

# Expect at least the codon alignment and the protein alignment.
if len(fasta_files) < 2:
    raise RuntimeError("Babappalign did not produce expected FASTA outputs.")

codon_file = None
protein_file = None

# First pass: identify outputs by filename keywords.
for f in fasta_files:
    name = f.name.lower()

    if "codon" in name:
        codon_file = f
    elif "protein" in name or "pep" in name:
        protein_file = f

# Fallback detection (if names don't contain keywords)
if codon_file is None or protein_file is None:

    # Identify by sequence alphabet: only the first record is inspected;
    # pure ACGTN (plus gaps) => nucleotide alignment.
    from Bio import SeqIO

    for f in fasta_files:
        record = next(SeqIO.parse(f, "fasta"))
        seq = str(record.seq)

        if set(seq.upper()) <= set("ACGTN-"):
            codon_file = f
        else:
            protein_file = f

if codon_file is None or protein_file is None:
    raise RuntimeError("Could not determine codon vs protein alignment files.")

############################################################
# Move to final Snakemake outputs
############################################################

shutil.move(str(codon_file), str(cds_aln_output))
shutil.move(str(protein_file), str(protein_aln_output))

############################################################
# Cleanup temporary input copy
############################################################

local_cds.unlink()

print("\n✅ Codon alignment complete.")
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import math
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from scipy.stats import chi2
|
|
9
|
+
|
|
10
|
+
############################################################
|
|
11
|
+
# Snakemake inputs
############################################################

# `snakemake` is injected by the Snakemake `script:` directive; this script
# is not importable on its own.
alignment = Path(snakemake.input.aln).resolve()
tree_file = Path(snakemake.input.tree).resolve()
branches_file = Path(snakemake.input.branches).resolve()

# Completion flag file; per-branch codeml results are written beside it.
output_flag = Path(snakemake.output[0]).resolve()

results_dir = output_flag.parent
results_dir.mkdir(parents=True, exist_ok=True)

############################################################
# Utilities
############################################################
def run_cmd(cmd, cwd=None):
    """Print the command being run, then execute it.

    Raises subprocess.CalledProcessError when the command fails.
    """
    print("[RUN]", " ".join(cmd))
    completed = subprocess.run(cmd, cwd=cwd, check=True)
    del completed  # return value intentionally unused
|
+
|
|
31
|
+
|
|
32
|
+
def extract_lnL(mlc_file):
    """Return the first log-likelihood value found in a PAML mlc file.

    Scans for the first line containing "lnL" that also holds a decimal
    number (e.g. "lnL(ntime: 7 np: 9): -1234.5678") and returns that
    number as a float.

    Raises:
        RuntimeError: if no such line exists.
    """
    decimal = re.compile(r"-?\d+\.\d+")
    with open(mlc_file) as handle:
        for row in handle:
            if "lnL" not in row:
                continue
            hit = decimal.search(row)
            if hit is not None:
                return float(hit.group(0))
    raise RuntimeError(f"lnL not found in {mlc_file}")
|
|
44
|
+
|
|
45
|
+
def extract_beb_sites(mlc_file):
    """Extract Bayes Empirical Bayes positively selected sites with PP >= 0.95.

    Scans the "Bayes Empirical Bayes" section of a codeml mlc file. Each data
    row looks like ``  12 K  0.987**`` (site, amino acid, posterior
    probability with '*'/'**' significance markers appended).

    Returns:
        list[tuple[str, str, float]]: (site, amino_acid, probability) tuples.
    """
    beb_sites = []
    capture = False

    with open(mlc_file) as f:
        for line in f:
            if "Bayes Empirical Bayes" in line:
                capture = True
                continue

            if capture:
                # Section ends at the first blank line.
                if line.strip() == "":
                    break

                parts = line.strip().split()
                if len(parts) >= 3:
                    try:
                        site = parts[0]
                        aa = parts[1]
                        # Strip the '*' significance markers before parsing.
                        prob = float(parts[-1].replace("*", ""))
                    except ValueError:
                        # Header or other non-numeric row inside the section;
                        # previously a bare `except:` silently swallowed
                        # *every* error here, which could hide real bugs.
                        continue
                    if prob >= 0.95:
                        beb_sites.append((site, aa, prob))

    return beb_sites
+
|
|
75
|
+
|
|
76
|
+
def write_ctl(path, seqfile, treefile, outfile, fix_omega):
    """Write a codeml branch-site model control file.

    model = 2 / NSsites = 2 selects the branch-site test of positive
    selection. fix_omega = 1 gives the null model (omega fixed at 1);
    fix_omega = 0 the alternative model (omega estimated).
    """
    options = [
        "",
        f"seqfile = {seqfile}",
        f"treefile = {treefile}",
        f"outfile = {outfile}",
        "",
        "noisy = 3",
        "verbose = 1",
        "runmode = 0",
        "",
        "seqtype = 1",
        "CodonFreq = 7",
        "clock = 0",
        "",
        "model = 2",
        "NSsites = 2",
        "",
        "icode = 0",
        "fix_kappa = 0",
        "kappa = 2",
        "",
        f"fix_omega = {fix_omega}",
        "omega = 1.5",
        "",
        "fix_alpha = 1",
        "alpha = 0",
        "Malpha = 0",
        "",
        "ncatG = 4",
        "getSE = 0",
        "RateAncestor = 0",
        "Small_Diff = 1e-6",
    ]
    with open(path, "w") as ctl:
        ctl.write("\n".join(options) + "\n")
|
|
114
|
+
|
|
115
|
+
############################################################
|
|
116
|
+
# Read foreground branches
############################################################

with open(branches_file) as f:
    branches = [line.strip() for line in f if line.strip()]

print(f"[INFO] Running codeml for {len(branches)} foreground branches")

summary_lines = []
summary_lines.append("Branch\tlnL_null\tlnL_alt\tLRT\tpvalue\tSignificant\tBEB_sites")

############################################################
# Run codeml per branch
############################################################

for branch in branches:

    print(f"\n[INFO] Processing branch: {branch}")

    # Isolated working directory per branch so codeml runs don't clobber
    # each other's fixed-name output files (mlc, rst, ...).
    workdir = results_dir / f"codeml_{branch}"
    workdir.mkdir(exist_ok=True)

    # Create branch-labeled tree: "#1" marks the foreground branch for the
    # branch-site model.
    # NOTE(review): a plain string replace also tags any taxon whose name
    # contains `branch` as a substring — confirm branch names are never
    # prefixes/substrings of one another.
    tree_text = open(tree_file).read()
    labeled_tree = tree_text.replace(branch, branch + " #1")
    labeled_tree_file = workdir / "tree.nwk"
    labeled_tree_file.write_text(labeled_tree)

    # Copy alignment
    aln_copy = workdir / "alignment.phy"
    subprocess.run(["cp", str(alignment), str(aln_copy)], check=True)

    # Write control files: null (omega fixed) vs alternative (omega free).
    alt_ctl = workdir / "alt.ctl"
    null_ctl = workdir / "null.ctl"

    write_ctl(alt_ctl, aln_copy, labeled_tree_file, "alt.mlc", fix_omega=0)
    write_ctl(null_ctl, aln_copy, labeled_tree_file, "null.mlc", fix_omega=1)

    try:
        run_cmd(["codeml", str(null_ctl)], cwd=workdir)
        run_cmd(["codeml", str(alt_ctl)], cwd=workdir)
    except subprocess.CalledProcessError:
        # Best-effort: skip branches where codeml fails rather than aborting
        # the whole run; such branches are simply absent from the summary.
        print(f"[WARNING] codeml failed for branch {branch}")
        continue

    # Extract lnL
    lnL_null = extract_lnL(workdir / "null.mlc")
    lnL_alt = extract_lnL(workdir / "alt.mlc")

    # Compute LRT
    LRT = 2 * (lnL_alt - lnL_null)

    # Mixture chi-square 0.5*χ²₁ — the boundary null distribution commonly
    # used for the branch-site test.
    pvalue = 0.5 * (1 - chi2.cdf(LRT, 1))

    significant = "YES" if pvalue <= 0.05 else "NO"

    beb_sites = extract_beb_sites(workdir / "alt.mlc")

    # Serialize BEB sites as "site:aa:prob" triples joined by ';'.
    beb_string = ";".join([f"{s}:{a}:{p:.3f}" for s,a,p in beb_sites])

    summary_lines.append(
        f"{branch}\t{lnL_null:.6f}\t{lnL_alt:.6f}\t{LRT:.4f}\t{pvalue:.6f}\t{significant}\t{beb_string}"
    )

############################################################
# Write summary
############################################################

summary_file = results_dir / "codeml_branch_site_summary.tsv"

with open(summary_file, "w") as f:
    f.write("\n".join(summary_lines))

print(f"\n[INFO] Summary written to {summary_file}")

# Touch done flag
output_flag.touch()

print("\n✅ Dynamic codeml complete.\n")
@@ -0,0 +1,263 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from Bio import SeqIO
|
|
6
|
+
from Bio.SeqRecord import SeqRecord
|
|
7
|
+
|
|
8
|
+
############################################################
|
|
9
|
+
# Utility
|
|
10
|
+
############################################################
|
|
11
|
+
|
|
12
|
+
def run_cmd(cmd, cwd=None):
    """Log an external command and run it, raising on non-zero exit."""
    print("[RUN]", " ".join(cmd))
    subprocess.run(cmd, check=True, cwd=cwd)
|
+
|
|
16
|
+
|
|
17
|
+
def make_blast_db(fasta, dbtype, db_dir):
    """Build (or reuse) a BLAST database for *fasta*; return its prefix path.

    The database lands in *db_dir* under the FASTA's stem. When the index
    file (.nin for nucleotide, .pin for protein) already exists, makeblastdb
    is skipped, so repeated invocations are cheap.
    """
    fasta = Path(fasta).resolve()
    db_dir = Path(db_dir).resolve()
    db_dir.mkdir(parents=True, exist_ok=True)

    prefix = db_dir / fasta.stem
    index_suffix = ".nin" if dbtype == "nucl" else ".pin"
    index_file = prefix.with_suffix(index_suffix)

    if not index_file.exists():
        run_cmd([
            "makeblastdb",
            "-in", str(fasta),
            "-dbtype", dbtype,
            "-out", str(prefix),
        ])

    return str(prefix)
|
+
|
|
35
|
+
|
|
36
|
+
############################################################
|
|
37
|
+
# Snakemake Inputs
############################################################

# `snakemake` is injected by the Snakemake `script:` directive.
query_file = Path(snakemake.input.query).resolve()
proteome_dir = Path(snakemake.input.proteomes).resolve()
cds_dir = Path(snakemake.input.cds).resolve()

protein_output = Path(snakemake.output.proteins).resolve()
cds_output = Path(snakemake.output.cds).resolve()

threads = snakemake.threads

# RBH / recovery parameters.
min_cov = 0.7              # minimum reciprocal alignment coverage
evalue_cutoff = 1e-5       # BLAST e-value threshold
identity_threshold = 99.0  # % identity required to accept a CDS match

results_dir = protein_output.parent
blast_db_dir = results_dir / "blastdb"

results_dir.mkdir(parents=True, exist_ok=True)

############################################################
# RBH STAGE
############################################################

print("\n=== RBH STAGE ===")

# Only the first record of the query FASTA is used — assumes a
# single-protein query. TODO confirm upstream guarantees this.
query_record = next(SeqIO.parse(query_file, "fasta"))
query_id = query_record.id

query_db = make_blast_db(query_file, "prot", blast_db_dir)

orthogroup_proteins = []

for species_path in proteome_dir.glob("*"):

    if species_path.suffix not in [".faa", ".fa", ".fasta"]:
        continue

    species_path = species_path.resolve()
    species_name = species_path.stem

    print(f"[INFO] Processing {species_name}")

    species_db = make_blast_db(species_path, "prot", blast_db_dir)

    fwd_out = results_dir / f"{species_name}_fwd.tsv"
    rev_out = results_dir / f"{species_name}_rev.tsv"

    # Forward search: query protein vs species proteome.
    run_cmd([
        "blastp",
        "-query", str(query_file),
        "-db", species_db,
        "-out", str(fwd_out),
        "-evalue", str(evalue_cutoff),
        "-outfmt", "6 qseqid sseqid pident length qlen slen bitscore",
        "-num_threads", str(threads)
    ])

    # Reverse search: species proteome vs query.
    run_cmd([
        "blastp",
        "-query", str(species_path),
        "-db", query_db,
        "-out", str(rev_out),
        "-evalue", str(evalue_cutoff),
        "-outfmt", "6 qseqid sseqid pident length qlen slen bitscore",
        "-num_threads", str(threads)
    ])

    def get_best_hits(blast_file):
        # Best (highest-bitscore) hit per query, restricted to hits covering
        # at least `min_cov` of BOTH query and subject lengths.
        best = {}
        with open(blast_file) as f:
            for line in f:
                q, s, pident, length, qlen, slen, bitscore = line.strip().split()
                length, qlen, slen = int(length), int(qlen), int(slen)
                bitscore = float(bitscore)

                if (length/qlen) >= min_cov and (length/slen) >= min_cov:
                    if q not in best or bitscore > best[q][1]:
                        best[q] = (s, bitscore)
        return {q: s for q, (s, _) in best.items()}

    fwd_hits = get_best_hits(fwd_out)
    rev_hits = get_best_hits(rev_out)

    # Reciprocal best hits: forward best hit whose own best hit is the query.
    rbh = {q: s for q, s in fwd_hits.items()
           if s in rev_hits and rev_hits[s] == q}

    if query_id not in rbh:
        print(f"[WARN] No RBH for {species_name}")
        continue

    ortholog_id = rbh[query_id]

    # Pull the ortholog's full-length protein record from the proteome.
    for record in SeqIO.parse(species_path, "fasta"):
        if record.id == ortholog_id:
            orthogroup_proteins.append(record)
            break

SeqIO.write(orthogroup_proteins, protein_output, "fasta")
print(f"[INFO] Protein orthogroup written: {protein_output}")

############################################################
# MERGE TRANSCRIPTS INTO RESULTS
############################################################

print("\n=== MERGING TRANSCRIPTS ===")

merged_transcripts = results_dir / "all_transcripts.fasta"

with open(merged_transcripts, "w") as out:
    for file in cds_dir.glob("*"):
        if file.suffix in [".fa", ".fasta", ".fna"]:
            for record in SeqIO.parse(file, "fasta"):
                SeqIO.write(record, out, "fasta")

############################################################
# TRANSDECODER
############################################################

print("\n=== RUNNING TRANSDECODER ===")

# Predict ORFs/CDS from the merged transcript set.
run_cmd(
    ["TransDecoder.LongOrfs", "-t", str(merged_transcripts)],
    cwd=str(results_dir)
)

run_cmd(
    ["TransDecoder.Predict", "-t", str(merged_transcripts), "--cpu", str(threads)],
    cwd=str(results_dir)
)

# TransDecoder appends ".transdecoder.cds" to the input filename.
predicted_cds = merged_transcripts.with_suffix(
    merged_transcripts.suffix + ".transdecoder.cds"
)

if not predicted_cds.exists():
    raise RuntimeError(f"TransDecoder CDS file not found: {predicted_cds}")

print(f"[INFO] Predicted CDS: {predicted_cds}")

############################################################
# BUILD BLAST DB
############################################################

cds_db = make_blast_db(predicted_cds, "nucl", blast_db_dir)

# In-memory index: CDS id -> SeqRecord, for fast lookup after tblastn.
transcript_index = SeqIO.to_dict(
    SeqIO.parse(predicted_cds, "fasta")
)

############################################################
# TBLASTN RECOVERY
############################################################

print("\n=== TBLASTN RECOVERY ===")

recovered_cds = []

# For each orthogroup protein, find its (near-)exact coding sequence among
# the TransDecoder-predicted CDS by translated search.
for protein in orthogroup_proteins:

    temp_query = results_dir / "temp_query.fasta"
    SeqIO.write(protein, temp_query, "fasta")

    blast_out = results_dir / "tblastn.tsv"

    run_cmd([
        "tblastn",
        "-query", str(temp_query),
        "-db", cds_db,
        "-out", str(blast_out),
        "-outfmt", "6 qseqid sseqid pident length qlen",
        "-num_threads", str(threads)
    ])

    with open(blast_out) as f:
        for line in f:
            q, s, pident, length, qlen = line.strip().split()

            # Accept only full-length, near-identical matches.
            if float(pident) >= identity_threshold and int(length) == int(qlen):
                cds_seq = transcript_index[s].seq

                # Remove terminal stop codon if present
                if cds_seq[-3:].upper() in ["TAA", "TAG", "TGA"]:
                    cds_seq = cds_seq[:-3]

                # Re-label the CDS with the protein's id so protein and CDS
                # orthogroups share identifiers downstream.
                recovered_cds.append(
                    SeqRecord(
                        cds_seq,
                        id=protein.id,
                        description=""
                    )
                )

                break  # first acceptable hit wins

SeqIO.write(recovered_cds, cds_output, "fasta")
print(f"[INFO] CDS orthogroup written: {cds_output}")

############################################################
# CLEANUP EVERYTHING EXCEPT FINAL OUTPUTS
############################################################

print("\n=== CLEANING INTERMEDIATES ===")

keep = {protein_output.resolve(), cds_output.resolve()}

# Best-effort removal of intermediates; failures are deliberately ignored so
# cleanup can never break the pipeline.
for item in results_dir.iterdir():
    if item.resolve() in keep:
        continue
    if item.is_dir():
        for sub in item.rglob("*"):
            try:
                sub.unlink()
            except:  # NOTE(review): bare except is deliberate best-effort here
                pass
        try:
            item.rmdir()
        except:
            pass
    else:
        try:
            item.unlink()
        except:
            pass

print("\n✅ Orthogroup module complete.\n")
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""Parse HyPhy aBSREL JSON results and write the foreground branch list
(branches with p <= 0.05; falls back to all terminal branches when none).

Run by Snakemake's ``script:`` directive; ``snakemake`` is injected at
runtime.
"""

import json
from pathlib import Path
from Bio import Phylo

############################################################
# Snakemake inputs
############################################################

absrel_json = Path(snakemake.input.json)
tree_file = Path(snakemake.input.tree)
output_file = Path(snakemake.output.branches)

############################################################
# Load absrel results
############################################################

with open(absrel_json) as f:
    data = json.load(f)

selected_branches = []

# HyPhy structure
# Per-branch statistics live under "branch attributes" -> "0"
# (the single-partition slot).
branch_data = data.get("branch attributes", {}).get("0", {})

for branch, info in branch_data.items():

    # Try multiple possible keys — the p-value key name has varied across
    # HyPhy versions.
    # NOTE(review): `or`-chaining skips a legitimate p-value of exactly 0.0
    # (falsy) and falls through to the next key — confirm acceptable.
    pval = (
        info.get("Corrected P-value")
        or info.get("p-value")
        or info.get("p-value (corrected)")
    )

    if pval is None:
        continue

    try:
        if float(pval) <= 0.05:
            selected_branches.append(branch)
    except:  # NOTE(review): bare except; consider narrowing to (TypeError, ValueError)
        continue

############################################################
# If none selected → fallback to all terminal branches
############################################################

if not selected_branches:

    print("[INFO] No significant branches from absrel.")
    print("[INFO] Falling back to all terminal branches.")

    tree = Phylo.read(tree_file, "newick")

    # Unnamed terminals (clade.name is None) are skipped.
    for clade in tree.get_terminals():
        if clade.name:
            selected_branches.append(clade.name)

############################################################
# Write output
############################################################

# One branch name per line, consumed by the dynamic_codeml step.
with open(output_file, "w") as out:
    for branch in selected_branches:
        out.write(branch + "\n")

print(f"[INFO] Foreground branches: {len(selected_branches)}")
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: babappa
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evolutionary selection pipeline integrating RBH, codon alignment, HyPhy, PAML, and ancestral reconstruction.
|
|
5
|
+
Author-email: Krishnendu Sinha <dr.krishnendusinha@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: biopython
|
|
10
|
+
Requires-Dist: snakemake
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
babappa/__init__.py
|
|
4
|
+
babappa/cli.py
|
|
5
|
+
babappa.egg-info/PKG-INFO
|
|
6
|
+
babappa.egg-info/SOURCES.txt
|
|
7
|
+
babappa.egg-info/dependency_links.txt
|
|
8
|
+
babappa.egg-info/entry_points.txt
|
|
9
|
+
babappa.egg-info/requires.txt
|
|
10
|
+
babappa.egg-info/top_level.txt
|
|
11
|
+
babappa/workflow/scripts/ancestral_reconstruction.py
|
|
12
|
+
babappa/workflow/scripts/codon_align.py
|
|
13
|
+
babappa/workflow/scripts/dynamic_codeml.py
|
|
14
|
+
babappa/workflow/scripts/orthogroup.py
|
|
15
|
+
babappa/workflow/scripts/parse_absrel.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
babappa
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "babappa"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Evolutionary selection pipeline integrating RBH, codon alignment, HyPhy, PAML, and ancestral reconstruction."
|
|
9
|
+
authors = [
|
|
10
|
+
{name = "Krishnendu Sinha", email = "dr.krishnendusinha@gmail.com"}
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.9"
|
|
14
|
+
license = {text = "MIT"}
|
|
15
|
+
dependencies = [
|
|
16
|
+
"biopython",
|
|
17
|
+
"snakemake"
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
babappa = "babappa.cli:main"
|
babappa-0.1.0/setup.cfg
ADDED