ancify 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ancify/__init__.py +3 -0
- ancify/__main__.py +3 -0
- ancify/ancestral.py +351 -0
- ancify/backend.py +360 -0
- ancify/cli.py +205 -0
- ancify/config.py +188 -0
- ancify/evaluate.py +197 -0
- ancify/ml.py +446 -0
- ancify/parsimony.py +287 -0
- ancify/project.py +163 -0
- ancify/utils.py +79 -0
- ancify-1.3.0.dist-info/METADATA +31 -0
- ancify-1.3.0.dist-info/RECORD +16 -0
- ancify-1.3.0.dist-info/WHEEL +5 -0
- ancify-1.3.0.dist-info/entry_points.txt +2 -0
- ancify-1.3.0.dist-info/top_level.txt +1 -0
ancify/__init__.py
ADDED
ancify/__main__.py
ADDED
ancify/ancestral.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""Phase 2: Infer ancestral alleles from projected outgroup sequences.
|
|
2
|
+
|
|
3
|
+
Uses a two-tier outgroup voting scheme:
|
|
4
|
+
|
|
5
|
+
* **Inner outgroup** -- closely related species; the most frequent
|
|
6
|
+
nucleotide among them forms the inner consensus.
|
|
7
|
+
* **Outer outgroup** -- more distantly related species; serves as an
|
|
8
|
+
independent confirmation.
|
|
9
|
+
|
|
10
|
+
Confidence is encoded via letter case in the output FASTA:
|
|
11
|
+
|
|
12
|
+
======== =========== ==========================================
|
|
13
|
+
Char Confidence Condition
|
|
14
|
+
======== =========== ==========================================
|
|
15
|
+
``ACGT`` High Inner and outer outgroups agree
|
|
16
|
+
``acgt`` Low Only one tier has data
|
|
17
|
+
``n`` Unresolved Inner and outer disagree
|
|
18
|
+
``N`` Missing Both tiers lack data
|
|
19
|
+
======== =========== ==========================================
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
|
25
|
+
|
|
26
|
+
from .utils import read_fasta, write_fasta, majority_vote, VALID_ALLELES
|
|
27
|
+
from .backend import vectorized_ancestral_call, get_available_gpus
|
|
28
|
+
from .parsimony import (
|
|
29
|
+
VALID_BASES,
|
|
30
|
+
fitch_ancestral,
|
|
31
|
+
get_leaf_names,
|
|
32
|
+
parse_newick,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def call_ancestral_base(inner_bases, outer_bases,
                        min_inner_freq=1, min_outer_freq=1):
    """Call the ancestral allele at one site from two outgroup tiers.

    Each tier is reduced to a single consensus via ``majority_vote``; the
    two consensus calls are then combined into one case-encoded character.

    Parameters
    ----------
    inner_bases : list of str
        Nucleotides observed in the inner (closely related) outgroups.
    outer_bases : list of str
        Nucleotides observed in the outer (distantly related) outgroups.
    min_inner_freq, min_outer_freq : int
        Threshold forwarded to ``majority_vote`` for the respective tier.

    Returns
    -------
    str
        * the inner consensus unchanged when both tiers agree on a
          non-``"N"`` call,
        * the lower-cased consensus of the only tier that is not ``"N"``,
        * ``"N"`` when both tiers are ``"N"``,
        * ``"n"`` when both tiers have a call but the calls differ.
    """
    inner_call = majority_vote(inner_bases, min_inner_freq)
    outer_call = majority_vote(outer_bases, min_outer_freq)

    inner_missing = inner_call == "N"
    outer_missing = outer_call == "N"

    # No information from either tier.
    if inner_missing and outer_missing:
        return "N"
    # Exactly one tier is informative: report it with low confidence.
    if inner_missing:
        return outer_call.lower()
    if outer_missing:
        return inner_call.lower()
    # Both tiers are informative: agreement is high confidence,
    # disagreement is unresolved.
    return inner_call if inner_call == outer_call else "n"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _call_chromosome(args):
    """Worker: call ancestral states for one chromosome, site by site.

    Parameters
    ----------
    args : tuple
        ``(chrom, inner_paths, outer_paths, out_path, min_inner, min_outer)``.
        The two path lists name the FASTA files of the inner and outer
        outgroups; ``out_path`` is where the result is written.

    Returns
    -------
    str
        The chromosome name, so the caller can report completion.

    Raises
    ------
    ValueError
        If any loaded sequence differs in length from the first inner
        sequence.

    Notes
    -----
    The reference length is taken from the first inner sequence, so at
    least one inner path must be supplied.
    """
    chrom, inner_paths, outer_paths, out_path, min_inner, min_outer = args

    # Element 1 of read_fasta's result is used as the sequence.
    inner_seqs = [read_fasta(path)[1] for path in inner_paths]
    outer_seqs = [read_fasta(path)[1] for path in outer_paths]

    expected = len(inner_seqs[0])
    for seq in inner_seqs + outer_seqs:
        if len(seq) == expected:
            continue
        raise ValueError(
            f"Length mismatch on {chrom}: expected {expected}, got {len(seq)}"
        )

    # One call per alignment column; bases are upper-cased before voting.
    calls = "".join(
        call_ancestral_base(
            [seq[pos].upper() for seq in inner_seqs],
            [seq[pos].upper() for seq in outer_seqs],
            min_inner,
            min_outer,
        )
        for pos in range(expected)
    )

    write_fasta(out_path, f">{chrom}", calls)
    return chrom
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _call_chromosome_vectorized(args):
    """Vectorized worker: call ancestral states for one chromosome.

    Delegates the whole chromosome to ``vectorized_ancestral_call``
    rather than looping over positions in Python.

    Parameters
    ----------
    args : tuple
        ``(chrom, inner_paths, outer_paths, out_path, min_inner,
        min_outer, device_str)``. *device_str* is ``None`` for the CPU
        path, otherwise a device string such as ``"cuda:0"`` that is
        turned into a ``torch.device``.

    Returns
    -------
    str
        The chromosome name.

    Raises
    ------
    ValueError
        If any loaded sequence differs in length from the first inner
        sequence.
    """
    (chrom, inner_paths, outer_paths, out_path,
     min_inner, min_outer, device_str) = args

    if device_str is None:
        device = None
    else:
        # torch is imported only when a device string was supplied.
        import torch
        device = torch.device(device_str)

    inner_seqs = [read_fasta(path)[1] for path in inner_paths]
    outer_seqs = [read_fasta(path)[1] for path in outer_paths]

    # Every sequence must match the length of the first inner sequence.
    expected = len(inner_seqs[0])
    for seq in inner_seqs + outer_seqs:
        if len(seq) == expected:
            continue
        raise ValueError(
            f"Length mismatch on {chrom}: expected {expected}, got {len(seq)}"
        )

    calls = vectorized_ancestral_call(
        inner_seqs, outer_seqs, min_inner, min_outer, device,
    )
    write_fasta(out_path, f">{chrom}", calls)
    return chrom
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def call_ancestral_base_parsimony(tree, species_bases):
    """Call the ancestral allele at one site using Fitch parsimony.

    Parameters
    ----------
    tree : TreeNode
        Phylogenetic tree of the outgroup species.
    species_bases : dict
        Mapping of species name to the observed nucleotide (one character).
        Leaves absent from the mapping are treated as ``"N"``.

    Returns
    -------
    str
        ``"N"`` when no leaf of *tree* carries a base in ``VALID_BASES``;
        otherwise the allele returned by ``fitch_ancestral``, lower-cased
        if it was flagged ambiguous and upper-cased if not.
    """
    # Look for the first leaf with usable data; the for/else returns "N"
    # when the scan finishes without finding one.
    for leaf in get_leaf_names(tree):
        if species_bases.get(leaf, "N").upper() in VALID_BASES:
            break
    else:
        return "N"

    allele, is_ambiguous = fitch_ancestral(tree, species_bases)
    return allele.lower() if is_ambiguous else allele.upper()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _call_chromosome_parsimony(args):
    """Worker: call ancestral states for one chromosome using Fitch parsimony.

    Parameters
    ----------
    args : tuple
        ``(chrom, species_paths, tree_text, out_path)`` where
        *species_paths* maps species name to FASTA path and *tree_text*
        is parsed with ``parse_newick``.

    Returns
    -------
    str
        The chromosome name.

    Raises
    ------
    ValueError
        If a sequence differs in length from the first one loaded.
    """
    chrom, species_paths, tree_text, out_path = args

    tree = parse_newick(tree_text)

    seq_by_species = {}
    expected = None
    for species, fasta_path in species_paths.items():
        _header, seq = read_fasta(fasta_path)
        seq_by_species[species] = seq
        if expected is None:
            # The first sequence defines the reference length.
            expected = len(seq)
            continue
        if len(seq) != expected:
            raise ValueError(
                f"Length mismatch on {chrom}: expected {expected}, got {len(seq)}"
            )

    # Build one {species: base} column per position and call it on the tree.
    calls = "".join(
        call_ancestral_base_parsimony(
            tree,
            {species: seq[pos].upper() for species, seq in seq_by_species.items()},
        )
        for pos in range(expected)
    )

    write_fasta(out_path, f">{chrom}", calls)
    return chrom
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def run_ancestral_calling(config):
    """Execute Phase 2: call ancestral alleles for every chromosome.

    Reads projected FASTA files from ``<work_dir>/projected/<species>/``
    and writes ancestral FASTA files to ``<output_dir>/<chrom>.fa``.

    The algorithm is chosen by ``config.method`` (``"voting"`` when the
    attribute is absent). ``"parsimony"`` and ``"ml"`` select their
    dedicated runners; every other value runs the voting algorithm.
    """
    method = getattr(config, "method", "voting")

    # Explicitly named alternatives, checked in order.
    for name, runner in (("parsimony", _run_parsimony), ("ml", _run_ml)):
        if method == name:
            return runner(config)

    # Anything else falls through to voting.
    _run_voting(config)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _run_parsimony(config):
    """Run parsimony-based ancestral calling for all chromosomes.

    Builds one task per chromosome from the combined inner and outer
    outgroups, then runs ``_call_chromosome_parsimony`` over the tasks in
    a process pool of ``config.num_cpus`` workers. The output directory
    is created if it does not exist. An exception raised by a worker
    propagates when its result is collected.
    """
    chromosomes = config.resolve_chromosomes()
    projected_root = Path(config.work_dir) / "projected"
    out_dir = Path(config.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Parsimony places every outgroup on one tree, so the tiers are merged.
    outgroups = config.outgroups_inner + config.outgroups_outer

    tasks = [
        (
            chrom,
            {
                og.name: str(projected_root / og.name / f"{chrom}.fa")
                for og in outgroups
            },
            config.tree,
            str(out_dir / f"{chrom}.fa"),
        )
        for chrom in chromosomes
    ]

    logger.info(
        "Phase 2: calling ancestral states for %d chromosomes [parsimony]",
        len(tasks),
    )

    with ProcessPoolExecutor(max_workers=config.num_cpus) as pool:
        pending = [
            pool.submit(_call_chromosome_parsimony, task) for task in tasks
        ]
        # Report chromosomes in completion order.
        for finished in as_completed(pending):
            logger.info(" Completed %s", finished.result())

    logger.info("Phase 2 complete.")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _run_ml(config):
    """Run ML-based ancestral calling for all chromosomes.

    Builds one task per chromosome carrying the inner and outer projected
    FASTA paths, the output path, the model path and the two confidence
    thresholds from *config*. The tasks are run through
    ``_call_chromosome_ml`` in a process pool of ``config.num_cpus``
    workers.

    Raises
    ------
    ValueError
        If ``config.ml_model_path`` is empty or unset. The output
        directory has already been created by then.
    """
    from .ml import _call_chromosome_ml

    chromosomes = config.resolve_chromosomes()
    projected_root = Path(config.work_dir) / "projected"
    out_dir = Path(config.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    model_path = config.ml_model_path
    if not model_path:
        raise ValueError(
            "method 'ml' requires 'ml_model_path' pointing to a trained model. "
            "Run 'ancify train' first."
        )

    def projected_paths(outgroups, chrom):
        # Projected FASTA path of *chrom* for each outgroup in the tier.
        return [
            str(projected_root / og.name / f"{chrom}.fa") for og in outgroups
        ]

    tasks = [
        (
            chrom,
            projected_paths(config.outgroups_inner, chrom),
            projected_paths(config.outgroups_outer, chrom),
            str(out_dir / f"{chrom}.fa"),
            model_path,
            config.ml_high_threshold,
            config.ml_low_threshold,
        )
        for chrom in chromosomes
    ]

    logger.info(
        "Phase 2: calling ancestral states for %d chromosomes [ml]",
        len(tasks),
    )

    with ProcessPoolExecutor(max_workers=config.num_cpus) as pool:
        pending = [pool.submit(_call_chromosome_ml, task) for task in tasks]
        # Report chromosomes in completion order.
        for finished in as_completed(pending):
            logger.info(" Completed %s", finished.result())

    logger.info("Phase 2 complete.")
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _run_voting(config):
    """Run two-tier voting ancestral calling for all chromosomes.

    Steps:

    1. Resolve the backend. With ``config.backend`` equal to ``"auto"``
       or ``"gpu"`` (``"auto"`` when the attribute is absent), the GPUs
       reported by ``get_available_gpus`` are used, optionally restricted
       to ``config.gpu_devices``. If none remain, the CPU path is used; a
       warning is logged when ``"gpu"`` was requested explicitly.
    2. Build one task per chromosome. On the GPU path, chromosomes are
       assigned to devices round-robin.
    3. Run ``_call_chromosome_vectorized`` over the tasks, in a thread
       pool on the GPU path or a process pool of ``config.num_cpus``
       workers on the CPU path.
    """
    chromosomes = config.resolve_chromosomes()
    projected_root = Path(config.work_dir) / "projected"
    out_dir = Path(config.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # -- resolve backend and GPU devices ---------------------------------
    backend_mode = getattr(config, "backend", "auto")
    gpu_device_ids = getattr(config, "gpu_devices", None)

    devices = []
    use_gpu = False

    if backend_mode in ("auto", "gpu"):
        gpus = get_available_gpus()
        if gpu_device_ids is not None:
            # Keep only the GPUs the configuration explicitly allows.
            gpus = [gpu for gpu in gpus if gpu in gpu_device_ids]
        if gpus:
            import torch  # noqa: F811 – guarded import
            devices = [torch.device(f"cuda:{gpu}") for gpu in gpus]
            use_gpu = True

    if not use_gpu and backend_mode == "gpu":
        logger.warning(
            "GPU backend requested but no CUDA devices available; "
            "falling back to CPU vectorised path"
        )

    # -- build task list -------------------------------------------------
    def projected_paths(outgroups, chrom):
        # Projected FASTA path of *chrom* for each outgroup in the tier.
        return [
            str(projected_root / og.name / f"{chrom}.fa") for og in outgroups
        ]

    tasks = []
    for index, chrom in enumerate(chromosomes):
        inner_paths = projected_paths(config.outgroups_inner, chrom)
        outer_paths = projected_paths(config.outgroups_outer, chrom)
        out_path = str(out_dir / f"{chrom}.fa")
        if use_gpu:
            # Round-robin assignment; passed on as a string.
            device_str = str(devices[index % len(devices)])
        else:
            device_str = None
        tasks.append((
            chrom, inner_paths, outer_paths, out_path,
            config.min_inner_freq, config.min_outer_freq, device_str,
        ))

    label = f"gpu ({len(devices)} device(s))" if use_gpu else "cpu"
    logger.info(
        "Phase 2: calling ancestral states for %d chromosomes [%s]",
        len(tasks), label,
    )

    # -- dispatch --------------------------------------------------------
    if use_gpu:
        # Original author's rationale: GPU ops release the GIL, so threads
        # overlap I/O and compute. Two workers per device, never fewer
        # than one in total.
        pool_cls = ThreadPoolExecutor
        max_workers = min(len(tasks), 2 * len(devices)) or 1
    else:
        pool_cls = ProcessPoolExecutor
        max_workers = config.num_cpus

    with pool_cls(max_workers=max_workers) as pool:
        pending = [
            pool.submit(_call_chromosome_vectorized, task) for task in tasks
        ]
        # Report chromosomes in completion order.
        for finished in as_completed(pending):
            logger.info(" Completed %s", finished.result())

    logger.info("Phase 2 complete.")
|