ancify 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ancify/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """ancify -- Ancestral allele polarization pipeline using outgroup species."""
2
+
3
# Package version exposed at runtime; kept in sync with the released
# distribution (the wheel this module ships in is ancify 1.3.0).
__version__ = "1.3.0"
ancify/__main__.py ADDED
@@ -0,0 +1,3 @@
1
# Executed when the package is run as a module: hand control straight to
# the command-line entry point defined in the sibling ``cli`` module.
from . import cli

cli.main()
ancify/ancestral.py ADDED
@@ -0,0 +1,351 @@
1
+ """Phase 2: Infer ancestral alleles from projected outgroup sequences.
2
+
3
+ Uses a two-tier outgroup voting scheme:
4
+
5
+ * **Inner outgroup** -- closely related species; the most frequent
6
+ nucleotide among them forms the inner consensus.
7
+ * **Outer outgroup** -- more distantly related species; serves as an
8
+ independent confirmation.
9
+
10
+ Confidence is encoded via letter case in the output FASTA:
11
+
12
+ ======== =========== ==========================================
13
+ Char Confidence Condition
14
+ ======== =========== ==========================================
15
+ ``ACGT`` High Inner and outer outgroups agree
16
+ ``acgt`` Low Only one tier has data
17
+ ``n`` Unresolved Inner and outer disagree
18
+ ``N`` Missing Both tiers lack data
19
+ ======== =========== ==========================================
20
+ """
21
+
22
+ import logging
23
+ from pathlib import Path
24
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
25
+
26
+ from .utils import read_fasta, write_fasta, majority_vote, VALID_ALLELES
27
+ from .backend import vectorized_ancestral_call, get_available_gpus
28
+ from .parsimony import (
29
+ VALID_BASES,
30
+ fitch_ancestral,
31
+ get_leaf_names,
32
+ parse_newick,
33
+ )
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
def call_ancestral_base(inner_bases, outer_bases,
                        min_inner_freq=1, min_outer_freq=1):
    """Return the case-encoded ancestral call for a single position.

    Each tier is reduced to one consensus symbol with ``majority_vote``;
    a consensus of ``"N"`` is treated as "no data" for that tier.

    Parameters
    ----------
    inner_bases : list of str
        Nucleotides observed in the inner (closely related) outgroups.
    outer_bases : list of str
        Nucleotides observed in the outer (distantly related) outgroups.
    min_inner_freq, min_outer_freq : int
        Thresholds forwarded to ``majority_vote`` for the respective tier.

    Returns
    -------
    str
        * the inner consensus unchanged when both tiers agree,
        * the lower-cased consensus of the only tier that has data,
        * ``"N"`` when neither tier has data,
        * ``"n"`` when both tiers have data but disagree.
    """
    inner_call = majority_vote(inner_bases, min_inner_freq)
    outer_call = majority_vote(outer_bases, min_outer_freq)

    inner_missing = inner_call == "N"
    outer_missing = outer_call == "N"

    # Handle the missing-data cases first; what remains is two real calls.
    if inner_missing and outer_missing:
        return "N"
    if inner_missing:
        return outer_call.lower()
    if outer_missing:
        return inner_call.lower()

    # Both tiers produced a call: agreement is high confidence,
    # disagreement is reported as unresolved.
    return inner_call if inner_call == outer_call else "n"
68
+
69
+
70
def _call_chromosome(args):
    """Worker: call ancestral states for one chromosome, position by position.

    Parameters
    ----------
    args : tuple
        ``(chrom, inner_paths, outer_paths, out_path, min_inner, min_outer)``.
        The path lists name the FASTA files of the inner and outer
        outgroups; the second element returned by ``read_fasta`` is used as
        the sequence.

    Returns
    -------
    The *chrom* value taken from *args*, once the result has been written
    to *out_path* with ``write_fasta``.

    Raises
    ------
    ValueError
        If no outgroup sequence is supplied at all, or if the sequences do
        not all have the same length.
    """
    chrom, inner_paths, outer_paths, out_path, min_inner, min_outer = args

    inner_seqs = [read_fasta(p)[1] for p in inner_paths]
    outer_seqs = [read_fasta(p)[1] for p in outer_paths]

    all_seqs = inner_seqs + outer_seqs
    if not all_seqs:
        raise ValueError(f"No outgroup sequences provided for {chrom}")

    # Take the reference length from whichever tier has data, so an empty
    # inner tier no longer fails with an opaque IndexError.
    # NOTE(review): assumes majority_vote accepts an empty list for the
    # tier without species -- confirm against utils.majority_vote.
    length = len(all_seqs[0])
    for s in all_seqs:
        if len(s) != length:
            raise ValueError(
                f"Length mismatch on {chrom}: expected {length}, got {len(s)}"
            )

    # Bases are upper-cased before voting; the case of the returned
    # character is decided by call_ancestral_base.
    anc = [
        call_ancestral_base(
            [s[i].upper() for s in inner_seqs],
            [s[i].upper() for s in outer_seqs],
            min_inner,
            min_outer,
        )
        for i in range(length)
    ]

    write_fasta(out_path, f">{chrom}", "".join(anc))
    return chrom
92
+
93
+
94
def _call_chromosome_vectorized(args):
    """Vectorized worker: call ancestral states for one chromosome.

    The per-position work is delegated to ``vectorized_ancestral_call``
    instead of a Python loop.

    Parameters
    ----------
    args : tuple
        ``(chrom, inner_paths, outer_paths, out_path, min_inner, min_outer,
        device_str)``. *device_str* is ``None`` for the CPU path or a
        device string such as ``"cuda:0"``; in the latter case ``torch`` is
        imported and a ``torch.device`` is passed to the backend.

    Returns
    -------
    The *chrom* value taken from *args*, once the result has been written
    to *out_path* with ``write_fasta``.

    Raises
    ------
    ValueError
        If no outgroup sequence is supplied at all, or if the sequences do
        not all have the same length.
    """
    chrom, inner_paths, outer_paths, out_path, min_inner, min_outer, device_str = args

    device = None
    if device_str is not None:
        # Imported lazily so the CPU path does not require torch.
        import torch
        device = torch.device(device_str)

    inner_seqs = [read_fasta(p)[1] for p in inner_paths]
    outer_seqs = [read_fasta(p)[1] for p in outer_paths]

    all_seqs = inner_seqs + outer_seqs
    if not all_seqs:
        raise ValueError(f"No outgroup sequences provided for {chrom}")

    # Take the reference length from whichever tier has data, so an empty
    # inner tier no longer fails with an opaque IndexError.
    # NOTE(review): whether vectorized_ancestral_call accepts an empty tier
    # is not visible here -- confirm against the backend module.
    length = len(all_seqs[0])
    for s in all_seqs:
        if len(s) != length:
            raise ValueError(
                f"Length mismatch on {chrom}: expected {length}, got {len(s)}"
            )

    anc = vectorized_ancestral_call(
        inner_seqs, outer_seqs, min_inner, min_outer, device,
    )
    write_fasta(out_path, f">{chrom}", anc)
    return chrom
123
+
124
+
125
def call_ancestral_base_parsimony(tree, species_bases):
    """Return the Fitch-parsimony ancestral call for a single position.

    Parameters
    ----------
    tree : TreeNode
        Phylogenetic tree of the outgroup species.
    species_bases : dict
        Maps a species name to the nucleotide observed for it (one
        character). Leaves absent from the mapping count as ``"N"``.

    Returns
    -------
    str
        ``"N"`` when no leaf of *tree* carries a base from ``VALID_BASES``;
        otherwise the allele reported by ``fitch_ancestral``, lower-cased
        when it is flagged ambiguous and upper-cased when it is not.
    """
    # Look for the first leaf with a usable base; the ``else`` branch runs
    # only when the scan finishes without finding one.
    for leaf in get_leaf_names(tree):
        if species_bases.get(leaf, "N").upper() in VALID_BASES:
            break
    else:
        return "N"

    allele, is_ambiguous = fitch_ancestral(tree, species_bases)
    return allele.lower() if is_ambiguous else allele.upper()
153
+
154
+
155
def _call_chromosome_parsimony(args):
    """Worker: call ancestral states for one chromosome using Fitch parsimony.

    Parameters
    ----------
    args : tuple
        ``(chrom, species_paths, tree_text, out_path)`` where
        *species_paths* maps a species name to its FASTA path and
        *tree_text* is handed to ``parse_newick``.

    Returns
    -------
    The *chrom* value taken from *args*, once the result has been written
    to *out_path* with ``write_fasta``.

    Raises
    ------
    ValueError
        If *species_paths* is empty, or if the sequences do not all have
        the same length.
    """
    chrom, species_paths, tree_text, out_path = args

    tree = parse_newick(tree_text)
    species_seqs = {}
    length = None
    for name, path in species_paths.items():
        _, seq = read_fasta(path)
        species_seqs[name] = seq
        if length is None:
            # The first sequence read defines the expected length.
            length = len(seq)
        elif len(seq) != length:
            raise ValueError(
                f"Length mismatch on {chrom}: expected {length}, got {len(seq)}"
            )

    if length is None:
        # Without this guard an empty mapping reached ``range(None)`` and
        # failed with an unrelated-looking TypeError.
        raise ValueError(f"No outgroup sequences provided for {chrom}")

    # One parsimony call per column; bases are upper-cased before the call.
    anc = [
        call_ancestral_base_parsimony(
            tree,
            {name: seq[i].upper() for name, seq in species_seqs.items()},
        )
        for i in range(length)
    ]

    write_fasta(out_path, f">{chrom}", "".join(anc))
    return chrom
179
+
180
+
181
def run_ancestral_calling(config):
    """Execute Phase 2: call ancestral alleles for every chromosome.

    Reads projected FASTA files from ``<work_dir>/projected/<species>/``
    and writes ancestral FASTA files to ``<output_dir>/<chrom>.fa``.

    The algorithm is chosen by ``config.method``: ``"parsimony"`` and
    ``"ml"`` select their dedicated runners; a missing attribute or any
    other value runs the two-tier voting algorithm.
    """
    method = getattr(config, "method", "voting")

    runner = None
    if method == "parsimony":
        runner = _run_parsimony
    elif method == "ml":
        runner = _run_ml

    if runner is not None:
        return runner(config)

    # Voting is the fallback; its result is not propagated to the caller.
    _run_voting(config)
    return None
197
+
198
+
199
def _run_parsimony(config):
    """Run Fitch-parsimony ancestral calling over all chromosomes.

    Builds one task per chromosome, covering every inner and outer
    outgroup, and runs ``_call_chromosome_parsimony`` on each in a process
    pool of ``config.num_cpus`` workers. The output directory is created
    if it does not exist.
    """
    chrom_names = config.resolve_chromosomes()
    projected_root = Path(config.work_dir) / "projected"
    results_dir = Path(config.output_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    # Parsimony does not distinguish tiers: all outgroups feed the tree.
    outgroups = config.outgroups_inner + config.outgroups_outer

    tasks = [
        (
            chrom,
            {
                og.name: str(projected_root / og.name / f"{chrom}.fa")
                for og in outgroups
            },
            config.tree,
            str(results_dir / f"{chrom}.fa"),
        )
        for chrom in chrom_names
    ]

    logger.info(
        "Phase 2: calling ancestral states for %d chromosomes [parsimony]",
        len(tasks),
    )

    with ProcessPoolExecutor(max_workers=config.num_cpus) as pool:
        pending = [
            pool.submit(_call_chromosome_parsimony, task) for task in tasks
        ]
        # ``result()`` re-raises any worker exception in this process.
        for finished in as_completed(pending):
            logger.info(" Completed %s", finished.result())

    logger.info("Phase 2 complete.")
231
+
232
+
233
def _run_ml(config):
    """Run model-based ancestral calling over all chromosomes.

    Builds one task per chromosome and runs ``_call_chromosome_ml`` on
    each in a process pool of ``config.num_cpus`` workers. The output
    directory is created if it does not exist.

    Raises
    ------
    ValueError
        If ``config.ml_model_path`` is empty or unset.
    """
    # Imported here rather than at module level, so the ml module is only
    # loaded when this method is selected.
    from .ml import _call_chromosome_ml

    chrom_names = config.resolve_chromosomes()
    projected_root = Path(config.work_dir) / "projected"
    results_dir = Path(config.output_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    model_path = config.ml_model_path
    if not model_path:
        raise ValueError(
            "method 'ml' requires 'ml_model_path' pointing to a trained model. "
            "Run 'ancify train' first."
        )

    def _tier_paths(outgroups, chrom):
        # Projected FASTA path of every outgroup in one tier.
        return [
            str(projected_root / og.name / f"{chrom}.fa") for og in outgroups
        ]

    tasks = []
    for chrom in chrom_names:
        inner_paths = _tier_paths(config.outgroups_inner, chrom)
        outer_paths = _tier_paths(config.outgroups_outer, chrom)
        destination = str(results_dir / f"{chrom}.fa")
        tasks.append((
            chrom, inner_paths, outer_paths, destination,
            model_path, config.ml_high_threshold, config.ml_low_threshold,
        ))

    logger.info(
        "Phase 2: calling ancestral states for %d chromosomes [ml]",
        len(tasks),
    )

    with ProcessPoolExecutor(max_workers=config.num_cpus) as pool:
        pending = [pool.submit(_call_chromosome_ml, task) for task in tasks]
        # ``result()`` re-raises any worker exception in this process.
        for finished in as_completed(pending):
            logger.info(" Completed %s", finished.result())

    logger.info("Phase 2 complete.")
279
+
280
+
281
def _run_voting(config):
    """Run the two-tier voting algorithm over all chromosomes.

    ``config.backend`` (default ``"auto"``) and ``config.gpu_devices``
    (default ``None``) decide where the work runs: for ``"auto"`` or
    ``"gpu"`` the GPUs reported by ``get_available_gpus`` are used,
    optionally restricted to the requested device ids. With at least one
    GPU, tasks are assigned to devices round-robin and run in a thread
    pool; otherwise they run in a process pool of ``config.num_cpus``
    workers. Every task is executed by ``_call_chromosome_vectorized``.
    """
    chrom_names = config.resolve_chromosomes()
    projected_root = Path(config.work_dir) / "projected"
    results_dir = Path(config.output_dir)
    results_dir.mkdir(parents=True, exist_ok=True)

    # -- backend / device selection ------------------------------------
    backend_mode = getattr(config, "backend", "auto")
    requested_ids = getattr(config, "gpu_devices", None)

    devices = []
    use_gpu = False
    if backend_mode in ("auto", "gpu"):
        candidates = get_available_gpus()
        if requested_ids is not None:
            candidates = [idx for idx in candidates if idx in requested_ids]
        if candidates:
            # torch is only imported once a usable GPU has been found.
            import torch
            devices = [torch.device(f"cuda:{idx}") for idx in candidates]
            use_gpu = True

    if not use_gpu and backend_mode == "gpu":
        logger.warning(
            "GPU backend requested but no CUDA devices available; "
            "falling back to CPU vectorised path"
        )

    # -- task list -----------------------------------------------------
    tasks = []
    for position, chrom in enumerate(chrom_names):
        fasta_name = f"{chrom}.fa"
        inner_paths = [
            str(projected_root / og.name / fasta_name)
            for og in config.outgroups_inner
        ]
        outer_paths = [
            str(projected_root / og.name / fasta_name)
            for og in config.outgroups_outer
        ]
        destination = str(results_dir / fasta_name)
        if use_gpu:
            # Round-robin assignment of chromosomes to devices.
            device_str = str(devices[position % len(devices)])
        else:
            device_str = None
        tasks.append((
            chrom, inner_paths, outer_paths, destination,
            config.min_inner_freq, config.min_outer_freq, device_str,
        ))

    if use_gpu:
        label = f"gpu ({len(devices)} device(s))"
    else:
        label = "cpu"
    logger.info(
        "Phase 2: calling ancestral states for %d chromosomes [%s]",
        len(tasks), label,
    )

    # -- dispatch ------------------------------------------------------
    if use_gpu:
        # Threads are used on the GPU path (per the original author: GPU
        # ops release the GIL, so threads overlap I/O and compute). At
        # most two workers per device, and never zero workers.
        pool_cls, max_workers = (
            ThreadPoolExecutor,
            min(len(tasks), len(devices) * 2) or 1,
        )
    else:
        pool_cls, max_workers = ProcessPoolExecutor, config.num_cpus

    with pool_cls(max_workers=max_workers) as pool:
        pending = [
            pool.submit(_call_chromosome_vectorized, task) for task in tasks
        ]
        # ``result()`` re-raises any worker exception in this thread.
        for finished in as_completed(pending):
            logger.info(" Completed %s", finished.result())

    logger.info("Phase 2 complete.")