babappa 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
babappa-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: babappa
3
+ Version: 0.1.0
4
+ Summary: Evolutionary selection pipeline integrating RBH, codon alignment, HyPhy, PAML, and ancestral reconstruction.
5
+ Author-email: Krishnendu Sinha <dr.krishnendusinha@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: biopython
10
+ Requires-Dist: snakemake
File without changes
File without changes
@@ -0,0 +1,28 @@
1
+ import argparse
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
def main():
    """Entry point for the ``babappa`` console script.

    Parses the command line and delegates the whole pipeline to Snakemake,
    pointing it at the Snakefile bundled in the package's ``workflow``
    directory.  A non-zero Snakemake exit propagates as
    ``subprocess.CalledProcessError``.
    """
    arg_parser = argparse.ArgumentParser(prog="babappa")
    # The three path arguments are mandatory; thread count is optional.
    for flag in ("--query", "--proteomes", "--cds"):
        arg_parser.add_argument(flag, required=True)
    arg_parser.add_argument("--threads", type=int, default=4)
    opts = arg_parser.parse_args()

    # The Snakefile ships inside the installed package.
    snakefile = Path(__file__).parent / "workflow" / "Snakefile"

    invocation = [
        "snakemake",
        "-s", str(snakefile),
        "--cores", str(opts.threads),
        "--config",
        f"query={opts.query}",
        f"proteomes={opts.proteomes}",
        f"cds={opts.cds}",
    ]

    subprocess.run(invocation, check=True)
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import subprocess
4
+ from pathlib import Path
5
+ import shutil
6
+ import re
7
+ from Bio.Seq import Seq
8
+ from Bio.SeqRecord import SeqRecord
9
+ from Bio import SeqIO
10
+
11
+ ############################################################
12
+ # Snakemake inputs
13
+ ############################################################
14
+
15
# Paths provided by the Snakemake rule (alignment, tree, and the
# branch-site summary table produced by the dynamic codeml step).
aln = Path(snakemake.input.aln).resolve()
tree = Path(snakemake.input.tree).resolve()
summary = Path(snakemake.input.summary).resolve()

# Single output: FASTA of reconstructed ancestral proteins.
output_fasta = Path(snakemake.output[0]).resolve()

base_dir = output_fasta.parent
base_dir.mkdir(parents=True, exist_ok=True)

############################################################
# Read significant branches
############################################################

significant_branches = []

with open(summary) as f:
    next(f)  # skip the header row of the TSV
    for line in f:
        fields = line.strip().split("\t")
        branch = fields[0]
        # Column 5 is assumed to be the "Significant" YES/NO flag written
        # by dynamic_codeml.py -- NOTE(review): confirm the column order
        # stays in sync with that script's header line.
        significant = fields[5]
        if significant == "YES":
            significant_branches.append(branch)

if not significant_branches:
    # Nothing to reconstruct: create an empty output so the Snakemake
    # rule still completes, then stop the script.
    print("[INFO] No significant branches. No ASR performed.")
    output_fasta.touch()
    exit()

print(f"[INFO] Reconstructing ancestors for {len(significant_branches)} branches")
45
+
46
+ ############################################################
47
+ # Utility
48
+ ############################################################
49
+
50
def write_ctl(path, seqfile, treefile):
    """Write a minimal codeml control file for ancestral reconstruction.

    ``RateAncestor = 1`` makes codeml emit the ``rst`` file containing the
    marginal ancestral sequence reconstruction; ``cleandata = 1`` removes
    ambiguous/gap columns before the analysis.  ``outfile`` is fixed to
    ``mlc`` inside the working directory.
    """
    content = f"""
seqfile = {seqfile}
treefile = {treefile}
outfile = mlc

noisy = 0
verbose = 0
runmode = 0

seqtype = 1
CodonFreq = 2

model = 0
NSsites = 0

RateAncestor = 1
cleandata = 1
"""
    path.write_text(content)
70
+
71
+
72
def extract_ancestral_from_rst(rst_file):
    """Parse sequences from the table codeml writes in its ``rst`` file.

    In the "List of extant and reconstructed sequences" section codeml
    prints ancestral rows as ``node #8  ATG GCA ...`` and extant rows as
    ``name  ATG GCA ...`` with codons separated by spaces.  The previous
    implementation kept only ``parts[1]`` (so an ancestral row was stored
    as ``{"node": "#8"}`` and extant sequences were truncated to their
    first codon) and aborted on the blank line that precedes the table.

    Returns a dict mapping label -> concatenated sequence string; ancestral
    nodes are keyed by their numeric label (e.g. ``"8"``), extant sequences
    by their name.
    """
    sequences = {}
    capture = False

    with open(rst_file) as f:
        for line in f:
            if "List of extant and reconstructed sequences" in line:
                capture = True
                continue

            if not capture:
                continue

            stripped = line.strip()
            if stripped == "":
                if sequences:
                    break      # blank line after the table ends the section
                continue       # blank padding before the table starts

            parts = stripped.split()
            if parts[0] == "node" and len(parts) >= 3:
                # Ancestral row: "node #8  <codons...>"
                sequences[parts[1].lstrip("#")] = "".join(parts[2:])
            elif len(parts) >= 2:
                seq = "".join(parts[1:])
                # Skip the "<nseq> <nsites>" counts line codeml prints first.
                if seq.isdigit():
                    continue
                sequences[parts[0]] = seq

    return sequences
93
+
94
+
95
+ ############################################################
96
+ # Perform ASR once (not per branch)
97
+ ############################################################
98
+
99
# Scratch directory for the codeml run; codeml writes many files (mlc,
# rst, rub, ...) into its working directory.
workdir = base_dir / "ancestral_tmp"
workdir.mkdir(exist_ok=True)

# codeml resolves seqfile/treefile relative to its cwd, so copy both in.
shutil.copy(aln, workdir / "aln.phy")
shutil.copy(tree, workdir / "tree.nwk")

ctl_file = workdir / "asr.ctl"
write_ctl(ctl_file, "aln.phy", "tree.nwk")

print("[INFO] Running codeml ancestral reconstruction")
subprocess.run(["codeml", str(ctl_file)], cwd=workdir, check=True)

# RateAncestor = 1 makes codeml emit the reconstruction into "rst".
rst_file = workdir / "rst"

if not rst_file.exists():
    raise RuntimeError("rst file not generated")

ancestral_sequences = extract_ancestral_from_rst(rst_file)

############################################################
# Map branches to ancestor node
############################################################

# Codeml internal nodes are numeric labels (e.g., 8, 9, 10...)
# We extract the parent node for each branch from the tree

with open(tree) as f:
    newick = f.read()

records = []

for branch in significant_branches:

    # Look for a cherry "(<sibling>,<branch>)" in the newick string.
    # NOTE(review): this only matches when the branch's sister taxon is a
    # named leaf appearing first in the pair, and the captured name must
    # also be a key in ancestral_sequences -- codeml's ancestors are
    # numeric node labels that normally do not occur in the input tree, so
    # this lookup may silently yield no records.  Confirm against real runs.
    pattern = r"\(([^,()]+)," + re.escape(branch) + r"\)"
    match = re.search(pattern, newick)

    if not match:
        continue

    ancestor_node = match.group(1)

    if ancestor_node in ancestral_sequences:
        seq = ancestral_sequences[ancestor_node]
        # Reconstructed sequence is codons; translate to protein.
        protein = Seq(seq).translate()

        records.append(
            SeqRecord(
                protein,
                id=f"Ancestor_of_{branch}",
                description=""
            )
        )

############################################################
# Write FASTA
############################################################

# Always produce the output file so the Snakemake rule succeeds even
# when no ancestor could be mapped.
if records:
    SeqIO.write(records, output_fasta, "fasta")
else:
    output_fasta.touch()

print("\n[INFO] Ancestral reconstruction completed.\n")
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import subprocess
4
+ from pathlib import Path
5
+ import shutil
6
+ import glob
7
+
8
+ ############################################################
9
+ # Utility
10
+ ############################################################
11
+
12
def run_cmd(cmd, cwd=None):
    """Echo *cmd* and execute it, raising CalledProcessError on failure."""
    shown = " ".join(cmd)
    print("[RUN]", shown)
    subprocess.run(cmd, check=True, cwd=cwd)
15
+
16
+
17
+ ############################################################
18
+ # Snakemake inputs
19
+ ############################################################
20
+
21
# Input CDS orthogroup FASTA from the Snakemake rule; resolved to an
# absolute path because babappalign runs with a different cwd below.
cds_input = Path(snakemake.input.cds).resolve()

protein_aln_output = Path(snakemake.output.protein_aln).resolve()
cds_aln_output = Path(snakemake.output.cds_aln).resolve()

# Declared rule threads -- NOTE(review): not currently passed to
# babappalign; confirm whether the tool accepts a thread option.
threads = snakemake.threads

alignment_dir = protein_aln_output.parent.resolve()
alignment_dir.mkdir(parents=True, exist_ok=True)

############################################################
# Copy input CDS into alignment directory
############################################################

# babappalign writes its outputs next to its input, so work on a local copy.
local_cds = alignment_dir / cds_input.name
shutil.copy(str(cds_input), str(local_cds))

############################################################
# Run babappalign in alignment directory
############################################################

print("\n=== RUNNING BABAPPALIGN (CODON MODE) ===")

run_cmd([
    "babappalign",
    local_cds.name,
    "--mode", "codon",
    "--model", "babappascore"
], cwd=alignment_dir)

############################################################
# Detect generated alignment files
############################################################

print("\n=== DETECTING BABAPPALIGN OUTPUT FILES ===")

# NOTE(review): this glob also picks up the copied input (local_cds) if it
# ends in .fasta, which can confuse the fallback detection below.
fasta_files = list(alignment_dir.glob("*.fasta"))

if len(fasta_files) < 2:
    raise RuntimeError("Babappalign did not produce expected FASTA outputs.")

codon_file = None
protein_file = None

# First pass: classify by filename keywords.
for f in fasta_files:
    name = f.name.lower()

    if "codon" in name:
        codon_file = f
    elif "protein" in name or "pep" in name:
        protein_file = f

# Fallback detection (if names don't contain keywords)
if codon_file is None or protein_file is None:

    # Identify by sequence alphabet
    from Bio import SeqIO

    # A pure ACGTN- alphabet is treated as nucleotide (codon) data;
    # anything else as protein.  Only the first record is inspected.
    for f in fasta_files:
        record = next(SeqIO.parse(f, "fasta"))
        seq = str(record.seq)

        if set(seq.upper()) <= set("ACGTN-"):
            codon_file = f
        else:
            protein_file = f

if codon_file is None or protein_file is None:
    raise RuntimeError("Could not determine codon vs protein alignment files.")

############################################################
# Move to final Snakemake outputs
############################################################

shutil.move(str(codon_file), str(cds_aln_output))
shutil.move(str(protein_file), str(protein_aln_output))

############################################################
# Cleanup temporary input copy
############################################################

local_cds.unlink()

print("\n✅ Codon alignment complete.")
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import re
5
+ import math
6
+ import subprocess
7
+ from pathlib import Path
8
+ from scipy.stats import chi2
9
+
10
+ ############################################################
11
+ # Snakemake inputs
12
+ ############################################################
13
+
14
# Inputs from the Snakemake rule: codon alignment, species tree, and the
# foreground-branch list written by parse_absrel.py.
alignment = Path(snakemake.input.aln).resolve()
tree_file = Path(snakemake.input.tree).resolve()
branches_file = Path(snakemake.input.branches).resolve()

# The rule's output is a "done" flag file; results land next to it.
output_flag = Path(snakemake.output[0]).resolve()

results_dir = output_flag.parent
results_dir.mkdir(parents=True, exist_ok=True)
22
+
23
+ ############################################################
24
+ # Utilities
25
+ ############################################################
26
+
27
def run_cmd(cmd, cwd=None):
    """Run an external command, echoing it first.

    A non-zero exit status raises subprocess.CalledProcessError
    (``check=True``).
    """
    banner = " ".join(cmd)
    print("[RUN]", banner)
    subprocess.run(cmd, cwd=cwd, check=True)
30
+
31
+
32
def extract_lnL(mlc_file):
    """Return the log-likelihood from the first ``lnL`` line of a PAML mlc file.

    Raises RuntimeError when no such line (with a float on it) exists.
    """
    float_pattern = re.compile(r"-?\d+\.\d+")
    with open(mlc_file) as handle:
        for row in handle:
            if "lnL" not in row:
                continue
            found = float_pattern.findall(row)
            if found:
                # The first float on the lnL line is the log-likelihood.
                return float(found[0])
    raise RuntimeError(f"lnL not found in {mlc_file}")
43
+
44
+
45
def extract_beb_sites(mlc_file):
    """Collect BEB positively-selected sites with posterior prob >= 0.95.

    Scans the "Bayes Empirical Bayes" section of a codeml mlc file and
    returns a list of ``(site, amino_acid, probability)`` tuples.  Header
    or explanatory lines inside the section do not parse as a probability
    and are skipped.  The previous bare ``except:`` has been narrowed to
    ``ValueError`` so genuine errors (e.g. KeyboardInterrupt) are not
    silently swallowed.
    """
    beb_sites = []
    capture = False

    with open(mlc_file) as f:
        for line in f:
            if "Bayes Empirical Bayes" in line:
                capture = True
                continue

            if not capture:
                continue

            if line.strip() == "":
                # First blank line terminates the BEB table.
                break

            parts = line.strip().split()
            if len(parts) < 3:
                continue

            try:
                # Probability is the last column, possibly starred
                # ("0.996**" marks significance levels).
                prob = float(parts[-1].rstrip("*"))
            except ValueError:
                # Section header / prose line, not a site row.
                continue

            if prob >= 0.95:
                site, aa = parts[0], parts[1]
                beb_sites.append((site, aa, prob))

    return beb_sites
74
+
75
+
76
def write_ctl(path, seqfile, treefile, outfile, fix_omega):
    """Write a codeml control file for the branch-site model A test.

    ``model = 2, NSsites = 2`` selects branch-site model A.  With
    ``fix_omega = 0`` the foreground omega is estimated (alternative
    model, starting value 1.5); with ``fix_omega = 1`` the null model is
    run, in which the foreground omega must be *fixed at exactly 1*.
    The previous version wrote ``omega = 1.5`` unconditionally, so the
    null model fixed omega at 1.5 instead of 1, invalidating the LRT
    against the chi-square mixture distribution.
    """
    # Null model: omega fixed at 1; alternative: starting value 1.5.
    omega = 1.0 if fix_omega else 1.5

    with open(path, "w") as ctl:
        ctl.write(f"""
seqfile = {seqfile}
treefile = {treefile}
outfile = {outfile}

noisy = 3
verbose = 1
runmode = 0

seqtype = 1
CodonFreq = 7
clock = 0

model = 2
NSsites = 2

icode = 0
fix_kappa = 0
kappa = 2

fix_omega = {fix_omega}
omega = {omega}

fix_alpha = 1
alpha = 0
Malpha = 0

ncatG = 4
getSE = 0
RateAncestor = 0
Small_Diff = 1e-6
""")
113
+
114
+
115
+ ############################################################
116
+ # Read foreground branches
117
+ ############################################################
118
+
119
# One foreground branch name per line; blank lines are ignored.
with open(branches_file) as f:
    branches = [line.strip() for line in f if line.strip()]

print(f"[INFO] Running codeml for {len(branches)} foreground branches")

summary_lines = []
summary_lines.append("Branch\tlnL_null\tlnL_alt\tLRT\tpvalue\tSignificant\tBEB_sites")

############################################################
# Run codeml per branch
############################################################

for branch in branches:

    print(f"\n[INFO] Processing branch: {branch}")

    # Dedicated working directory per branch so codeml runs don't clash.
    workdir = results_dir / f"codeml_{branch}"
    workdir.mkdir(exist_ok=True)

    # Create branch-labeled tree: append PAML's "#1" foreground marker.
    # NOTE(review): plain substring replace will also tag any taxon whose
    # name contains this branch name as a substring -- confirm branch
    # names are never prefixes of each other.
    tree_text = open(tree_file).read()
    labeled_tree = tree_text.replace(branch, branch + " #1")
    labeled_tree_file = workdir / "tree.nwk"
    labeled_tree_file.write_text(labeled_tree)

    # Copy alignment into the working directory.
    # NOTE(review): spawning "cp" is POSIX-only; shutil.copy would be
    # portable and avoid a subprocess.
    aln_copy = workdir / "alignment.phy"
    subprocess.run(["cp", str(alignment), str(aln_copy)], check=True)

    # Write control files for the alternative (omega free) and the
    # null (omega fixed) branch-site models.
    alt_ctl = workdir / "alt.ctl"
    null_ctl = workdir / "null.ctl"

    write_ctl(alt_ctl, aln_copy, labeled_tree_file, "alt.mlc", fix_omega=0)
    write_ctl(null_ctl, aln_copy, labeled_tree_file, "null.mlc", fix_omega=1)

    try:
        run_cmd(["codeml", str(null_ctl)], cwd=workdir)
        run_cmd(["codeml", str(alt_ctl)], cwd=workdir)
    except subprocess.CalledProcessError:
        # A failed branch is skipped rather than aborting the whole run.
        print(f"[WARNING] codeml failed for branch {branch}")
        continue

    # Extract lnL from both model fits.
    lnL_null = extract_lnL(workdir / "null.mlc")
    lnL_alt = extract_lnL(workdir / "alt.mlc")

    # Likelihood-ratio test statistic.
    LRT = 2 * (lnL_alt - lnL_null)

    # p-value from the 50:50 mixture of chi2(0) and chi2(1), the null
    # distribution recommended for the branch-site test.
    pvalue = 0.5 * (1 - chi2.cdf(LRT, 1))

    significant = "YES" if pvalue <= 0.05 else "NO"

    # Positively selected sites (BEB, PP >= 0.95) from the alternative fit.
    beb_sites = extract_beb_sites(workdir / "alt.mlc")

    beb_string = ";".join([f"{s}:{a}:{p:.3f}" for s,a,p in beb_sites])

    summary_lines.append(
        f"{branch}\t{lnL_null:.6f}\t{lnL_alt:.6f}\t{LRT:.4f}\t{pvalue:.6f}\t{significant}\t{beb_string}"
    )

############################################################
# Write summary
############################################################

summary_file = results_dir / "codeml_branch_site_summary.tsv"

with open(summary_file, "w") as f:
    f.write("\n".join(summary_lines))

print(f"\n[INFO] Summary written to {summary_file}")

# Touch done flag so the Snakemake rule is marked complete.
output_flag.touch()

print("\n✅ Dynamic codeml complete.\n")
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import subprocess
4
+ from pathlib import Path
5
+ from Bio import SeqIO
6
+ from Bio.SeqRecord import SeqRecord
7
+
8
+ ############################################################
9
+ # Utility
10
+ ############################################################
11
+
12
def run_cmd(cmd, cwd=None):
    """Print and execute *cmd*; non-zero exit raises CalledProcessError."""
    print("[RUN]", " ".join(cmd))
    completed = subprocess.run(cmd, check=True, cwd=cwd)
    del completed  # return value intentionally unused
15
+
16
+
17
def make_blast_db(fasta, dbtype, db_dir):
    """Create a BLAST database for *fasta* (once) and return its prefix.

    The database is built in *db_dir* with ``makeblastdb`` and skipped if
    its index file already exists, so repeated calls are cheap.

    Parameters:
        fasta: path-like, input FASTA file.
        dbtype: "nucl" or "prot" (determines the .nin / .pin index name).
        db_dir: directory holding the databases.

    Returns:
        str: database prefix suitable for the ``-db`` option of blastp /
        tblastn.
    """
    fasta = Path(fasta).resolve()
    db_dir = Path(db_dir).resolve()
    db_dir.mkdir(parents=True, exist_ok=True)

    prefix = db_dir / fasta.stem
    # Bug fix: Path.with_suffix() REPLACES an existing suffix, so for a
    # dotted stem like "proteome.v1" it produced "proteome.pin" and the
    # existence check always failed, rebuilding the DB on every call.
    # Appending the extension to the full prefix is always correct.
    index_file = Path(str(prefix) + (".nin" if dbtype == "nucl" else ".pin"))

    if not index_file.exists():
        run_cmd([
            "makeblastdb",
            "-in", str(fasta),
            "-dbtype", dbtype,
            "-out", str(prefix)
        ])

    return str(prefix)
34
+
35
+
36
+ ############################################################
37
+ # Snakemake Inputs
38
+ ############################################################
39
+
40
# Inputs from the Snakemake rule: one query protein FASTA, a directory of
# per-species proteomes, and a directory of per-species CDS/transcripts.
query_file = Path(snakemake.input.query).resolve()
proteome_dir = Path(snakemake.input.proteomes).resolve()
cds_dir = Path(snakemake.input.cds).resolve()

protein_output = Path(snakemake.output.proteins).resolve()
cds_output = Path(snakemake.output.cds).resolve()

threads = snakemake.threads

# Tuning constants: minimum reciprocal alignment coverage, BLAST e-value
# cutoff, and the percent identity required to pair a protein with its CDS.
min_cov = 0.7
evalue_cutoff = 1e-5
identity_threshold = 99.0

results_dir = protein_output.parent
blast_db_dir = results_dir / "blastdb"

results_dir.mkdir(parents=True, exist_ok=True)

############################################################
# RBH STAGE
############################################################

print("\n=== RBH STAGE ===")

# Only the first record of the query FASTA is used as the RBH anchor.
query_record = next(SeqIO.parse(query_file, "fasta"))
query_id = query_record.id

query_db = make_blast_db(query_file, "prot", blast_db_dir)

orthogroup_proteins = []

for species_path in proteome_dir.glob("*"):

    if species_path.suffix not in [".faa", ".fa", ".fasta"]:
        continue

    species_path = species_path.resolve()
    species_name = species_path.stem

    print(f"[INFO] Processing {species_name}")

    species_db = make_blast_db(species_path, "prot", blast_db_dir)

    fwd_out = results_dir / f"{species_name}_fwd.tsv"
    rev_out = results_dir / f"{species_name}_rev.tsv"

    # Forward search: query protein vs. species proteome.
    run_cmd([
        "blastp",
        "-query", str(query_file),
        "-db", species_db,
        "-out", str(fwd_out),
        "-evalue", str(evalue_cutoff),
        "-outfmt", "6 qseqid sseqid pident length qlen slen bitscore",
        "-num_threads", str(threads)
    ])

    # Reverse search: species proteome vs. the query database.
    run_cmd([
        "blastp",
        "-query", str(species_path),
        "-db", query_db,
        "-out", str(rev_out),
        "-evalue", str(evalue_cutoff),
        "-outfmt", "6 qseqid sseqid pident length qlen slen bitscore",
        "-num_threads", str(threads)
    ])

    # Best hit per query, requiring >= min_cov coverage on BOTH sequences.
    # NOTE(review): redefined on every loop iteration; it closes over
    # min_cov and could live at module level.
    def get_best_hits(blast_file):
        best = {}
        with open(blast_file) as f:
            for line in f:
                q, s, pident, length, qlen, slen, bitscore = line.strip().split()
                length, qlen, slen = int(length), int(qlen), int(slen)
                bitscore = float(bitscore)

                if (length/qlen) >= min_cov and (length/slen) >= min_cov:
                    if q not in best or bitscore > best[q][1]:
                        best[q] = (s, bitscore)
        return {q: s for q, (s, _) in best.items()}

    fwd_hits = get_best_hits(fwd_out)
    rev_hits = get_best_hits(rev_out)

    # Reciprocal best hits: A's best hit is B AND B's best hit is A.
    rbh = {q: s for q, s in fwd_hits.items()
           if s in rev_hits and rev_hits[s] == q}

    if query_id not in rbh:
        print(f"[WARN] No RBH for {species_name}")
        continue

    ortholog_id = rbh[query_id]

    # Pull the full ortholog record out of the species proteome.
    for record in SeqIO.parse(species_path, "fasta"):
        if record.id == ortholog_id:
            orthogroup_proteins.append(record)
            break

SeqIO.write(orthogroup_proteins, protein_output, "fasta")
print(f"[INFO] Protein orthogroup written: {protein_output}")

############################################################
# MERGE TRANSCRIPTS INTO RESULTS
############################################################

print("\n=== MERGING TRANSCRIPTS ===")

merged_transcripts = results_dir / "all_transcripts.fasta"

with open(merged_transcripts, "w") as out:
    for file in cds_dir.glob("*"):
        if file.suffix in [".fa", ".fasta", ".fna"]:
            for record in SeqIO.parse(file, "fasta"):
                SeqIO.write(record, out, "fasta")

############################################################
# TRANSDECODER
############################################################

print("\n=== RUNNING TRANSDECODER ===")

run_cmd(
    ["TransDecoder.LongOrfs", "-t", str(merged_transcripts)],
    cwd=str(results_dir)
)

run_cmd(
    ["TransDecoder.Predict", "-t", str(merged_transcripts), "--cpu", str(threads)],
    cwd=str(results_dir)
)

# TransDecoder names its output "<input>.transdecoder.cds".
predicted_cds = merged_transcripts.with_suffix(
    merged_transcripts.suffix + ".transdecoder.cds"
)

if not predicted_cds.exists():
    raise RuntimeError(f"TransDecoder CDS file not found: {predicted_cds}")

print(f"[INFO] Predicted CDS: {predicted_cds}")

############################################################
# BUILD BLAST DB
############################################################

cds_db = make_blast_db(predicted_cds, "nucl", blast_db_dir)

# In-memory index of predicted CDS, keyed by record id.
transcript_index = SeqIO.to_dict(
    SeqIO.parse(predicted_cds, "fasta")
)

############################################################
# TBLASTN RECOVERY
############################################################

print("\n=== TBLASTN RECOVERY ===")

recovered_cds = []

for protein in orthogroup_proteins:

    # Scratch files are reused (overwritten) for each protein.
    temp_query = results_dir / "temp_query.fasta"
    SeqIO.write(protein, temp_query, "fasta")

    blast_out = results_dir / "tblastn.tsv"

    run_cmd([
        "tblastn",
        "-query", str(temp_query),
        "-db", cds_db,
        "-out", str(blast_out),
        "-outfmt", "6 qseqid sseqid pident length qlen",
        "-num_threads", str(threads)
    ])

    with open(blast_out) as f:
        for line in f:
            q, s, pident, length, qlen = line.strip().split()

            # Accept only near-identical, full-length protein matches.
            if float(pident) >= identity_threshold and int(length) == int(qlen):
                cds_seq = transcript_index[s].seq

                # Remove terminal stop codon if present
                if cds_seq[-3:].upper() in ["TAA", "TAG", "TGA"]:
                    cds_seq = cds_seq[:-3]

                recovered_cds.append(
                    SeqRecord(
                        cds_seq,
                        id=protein.id,
                        description=""
                    )
                )

                # Keep only the first qualifying hit per protein.
                break

SeqIO.write(recovered_cds, cds_output, "fasta")
print(f"[INFO] CDS orthogroup written: {cds_output}")

############################################################
# CLEANUP EVERYTHING EXCEPT FINAL OUTPUTS
############################################################

print("\n=== CLEANING INTERMEDIATES ===")

keep = {protein_output.resolve(), cds_output.resolve()}

# Best-effort cleanup: errors while deleting (permissions, non-empty
# dirs, files vanishing) are deliberately ignored.  NOTE(review): this
# also removes the BLAST databases, so they are rebuilt on every run.
for item in results_dir.iterdir():
    if item.resolve() in keep:
        continue
    if item.is_dir():
        for sub in item.rglob("*"):
            try:
                sub.unlink()
            except:
                pass
        try:
            item.rmdir()
        except:
            pass
    else:
        try:
            item.unlink()
        except:
            pass

print("\n✅ Orthogroup module complete.\n")
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from Bio import Phylo
6
+
7
+ ############################################################
8
+ # Snakemake inputs
9
+ ############################################################
10
+
11
# Inputs from the Snakemake rule: HyPhy aBSREL JSON results and the tree.
absrel_json = Path(snakemake.input.json)
tree_file = Path(snakemake.input.tree)
output_file = Path(snakemake.output.branches)

############################################################
# Load absrel results
############################################################

with open(absrel_json) as f:
    data = json.load(f)

selected_branches = []

# HyPhy structure: per-branch results live under
# "branch attributes" -> "0" (the first/only partition).
branch_data = data.get("branch attributes", {}).get("0", {})

for branch, info in branch_data.items():

    # Try multiple possible keys -- the p-value key name has varied
    # between HyPhy versions.  NOTE(review): confirm against the HyPhy
    # version actually used; current aBSREL emits "Corrected P-value".
    pval = (
        info.get("Corrected P-value")
        or info.get("p-value")
        or info.get("p-value (corrected)")
    )

    if pval is None:
        continue

    try:
        if float(pval) <= 0.05:
            selected_branches.append(branch)
    except:
        # Unparseable p-value entry -- skip the branch.
        # NOTE(review): bare except also hides real errors; narrowing to
        # (TypeError, ValueError) would be safer.
        continue

############################################################
# If none selected → fallback to all terminal branches
############################################################

if not selected_branches:

    print("[INFO] No significant branches from absrel.")
    print("[INFO] Falling back to all terminal branches.")

    tree = Phylo.read(tree_file, "newick")

    # Every named leaf becomes a candidate foreground branch.
    for clade in tree.get_terminals():
        if clade.name:
            selected_branches.append(clade.name)

############################################################
# Write output
############################################################

# One branch name per line, consumed by dynamic_codeml.py.
with open(output_file, "w") as out:
    for branch in selected_branches:
        out.write(branch + "\n")

print(f"[INFO] Foreground branches: {len(selected_branches)}")
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.4
2
+ Name: babappa
3
+ Version: 0.1.0
4
+ Summary: Evolutionary selection pipeline integrating RBH, codon alignment, HyPhy, PAML, and ancestral reconstruction.
5
+ Author-email: Krishnendu Sinha <dr.krishnendusinha@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: biopython
10
+ Requires-Dist: snakemake
@@ -0,0 +1,15 @@
1
+ README.md
2
+ pyproject.toml
3
+ babappa/__init__.py
4
+ babappa/cli.py
5
+ babappa.egg-info/PKG-INFO
6
+ babappa.egg-info/SOURCES.txt
7
+ babappa.egg-info/dependency_links.txt
8
+ babappa.egg-info/entry_points.txt
9
+ babappa.egg-info/requires.txt
10
+ babappa.egg-info/top_level.txt
11
+ babappa/workflow/scripts/ancestral_reconstruction.py
12
+ babappa/workflow/scripts/codon_align.py
13
+ babappa/workflow/scripts/dynamic_codeml.py
14
+ babappa/workflow/scripts/orthogroup.py
15
+ babappa/workflow/scripts/parse_absrel.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ babappa = babappa.cli:main
@@ -0,0 +1,2 @@
1
+ biopython
2
+ snakemake
@@ -0,0 +1 @@
1
+ babappa
@@ -0,0 +1,21 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "babappa"
7
+ version = "0.1.0"
8
+ description = "Evolutionary selection pipeline integrating RBH, codon alignment, HyPhy, PAML, and ancestral reconstruction."
9
+ authors = [
10
+ {name = "Krishnendu Sinha", email = "dr.krishnendusinha@gmail.com"}
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.9"
14
+ license = {text = "MIT"}
15
+ dependencies = [
16
+ "biopython",
17
+ "snakemake"
18
+ ]
19
+
20
+ [project.scripts]
21
+ babappa = "babappa.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+