phylogenie 2.1.2__py3-none-any.whl → 2.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phylogenie/generators/alisim.py +22 -10
- phylogenie/generators/dataset.py +11 -23
- phylogenie/generators/factories.py +19 -8
- phylogenie/generators/trees.py +25 -11
- phylogenie/treesimulator/gillespie.py +28 -13
- {phylogenie-2.1.2.dist-info → phylogenie-2.1.3.dist-info}/METADATA +1 -1
- {phylogenie-2.1.2.dist-info → phylogenie-2.1.3.dist-info}/RECORD +10 -10
- {phylogenie-2.1.2.dist-info → phylogenie-2.1.3.dist-info}/LICENSE.txt +0 -0
- {phylogenie-2.1.2.dist-info → phylogenie-2.1.3.dist-info}/WHEEL +0 -0
- {phylogenie-2.1.2.dist-info → phylogenie-2.1.3.dist-info}/entry_points.txt +0 -0
phylogenie/generators/alisim.py
CHANGED
|
@@ -3,9 +3,10 @@ import subprocess
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Any, Literal
|
|
5
5
|
|
|
6
|
-
from numpy.random import Generator
|
|
6
|
+
from numpy.random import Generator, default_rng
|
|
7
7
|
|
|
8
8
|
from phylogenie.generators.dataset import DatasetGenerator, DataType
|
|
9
|
+
from phylogenie.generators.factories import data
|
|
9
10
|
from phylogenie.generators.trees import TreeDatasetGeneratorConfig
|
|
10
11
|
from phylogenie.io import dump_newick
|
|
11
12
|
|
|
@@ -42,9 +43,12 @@ class AliSimDatasetGenerator(DatasetGenerator):
|
|
|
42
43
|
subprocess.run(command, check=True, stdout=subprocess.DEVNULL)
|
|
43
44
|
subprocess.run(["rm", f"{tree_file}.log"], check=True)
|
|
44
45
|
|
|
45
|
-
def
|
|
46
|
-
self,
|
|
47
|
-
|
|
46
|
+
def generate_one(
|
|
47
|
+
self,
|
|
48
|
+
filename: str,
|
|
49
|
+
context: dict[str, Any] | None = None,
|
|
50
|
+
seed: int | None = None,
|
|
51
|
+
) -> dict[str, Any]:
|
|
48
52
|
if self.keep_trees:
|
|
49
53
|
base_dir, file_id = Path(filename).parent, Path(filename).stem
|
|
50
54
|
trees_dir = os.path.join(base_dir, TREES_DIRNAME)
|
|
@@ -57,16 +61,24 @@ class AliSimDatasetGenerator(DatasetGenerator):
|
|
|
57
61
|
tree_filename = f"{filename}.temp-tree"
|
|
58
62
|
msa_filename = filename
|
|
59
63
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
64
|
+
d: dict[str, Any] = {"file_id": Path(msa_filename).stem}
|
|
65
|
+
rng = default_rng(seed)
|
|
66
|
+
while True:
|
|
67
|
+
d.update(data(context, rng))
|
|
68
|
+
try:
|
|
69
|
+
tree = self.trees.simulate_one(d, seed)
|
|
70
|
+
break
|
|
71
|
+
except TimeoutError:
|
|
72
|
+
print(
|
|
73
|
+
"Tree simulation timed out, retrying with different parameters..."
|
|
74
|
+
)
|
|
63
75
|
|
|
64
76
|
for leaf in tree.get_leaves():
|
|
65
77
|
leaf.id += f"|{leaf.get_time()}"
|
|
66
78
|
dump_newick(tree, f"{tree_filename}.nwk")
|
|
67
79
|
|
|
68
|
-
self._generate_one_from_tree(
|
|
69
|
-
filename=msa_filename, tree_file=f"{tree_filename}.nwk", rng=rng, data=data
|
|
70
|
-
)
|
|
80
|
+
self._generate_one_from_tree(msa_filename, f"{tree_filename}.nwk", rng, d)
|
|
71
81
|
if not self.keep_trees:
|
|
72
82
|
os.remove(f"{tree_filename}.nwk")
|
|
83
|
+
|
|
84
|
+
return d
|
phylogenie/generators/dataset.py
CHANGED
|
@@ -1,16 +1,13 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from itertools import product
|
|
5
4
|
from typing import Any
|
|
6
5
|
|
|
7
6
|
import joblib
|
|
8
|
-
import numpy as np
|
|
9
7
|
import pandas as pd
|
|
10
8
|
from numpy.random import Generator, default_rng
|
|
11
9
|
from tqdm import tqdm
|
|
12
10
|
|
|
13
|
-
from phylogenie.generators.factories import distribution
|
|
14
11
|
from phylogenie.utils import Distribution, StrictBaseModel
|
|
15
12
|
|
|
16
13
|
|
|
@@ -31,15 +28,12 @@ class DatasetGenerator(ABC, StrictBaseModel):
|
|
|
31
28
|
context: dict[str, Distribution] | None = None
|
|
32
29
|
|
|
33
30
|
@abstractmethod
|
|
34
|
-
def _generate_one(
|
|
35
|
-
self, filename: str, rng: Generator, data: dict[str, Any]
|
|
36
|
-
) -> None: ...
|
|
37
|
-
|
|
38
31
|
def generate_one(
|
|
39
|
-
self,
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
32
|
+
self,
|
|
33
|
+
filename: str,
|
|
34
|
+
context: dict[str, Any] | None = None,
|
|
35
|
+
seed: int | None = None,
|
|
36
|
+
) -> dict[str, Any]: ...
|
|
43
37
|
|
|
44
38
|
def _generate(self, rng: Generator, n_samples: int, output_dir: str) -> None:
|
|
45
39
|
if os.path.exists(output_dir):
|
|
@@ -53,24 +47,18 @@ class DatasetGenerator(ABC, StrictBaseModel):
|
|
|
53
47
|
)
|
|
54
48
|
os.makedirs(data_dir)
|
|
55
49
|
|
|
56
|
-
data: list[dict[str, Any]] = [{} for _ in range(n_samples)]
|
|
57
|
-
if self.context is not None:
|
|
58
|
-
for d, (k, v) in product(data, self.context.items()):
|
|
59
|
-
dist = distribution(v, d)
|
|
60
|
-
d[k] = np.array(getattr(rng, dist.type)(**dist.args)).tolist()
|
|
61
|
-
df = pd.DataFrame([{"file_id": str(i), **d} for i, d in enumerate(data)])
|
|
62
|
-
df.to_csv(os.path.join(output_dir, METADATA_FILENAME), index=False)
|
|
63
|
-
|
|
64
50
|
jobs = joblib.Parallel(n_jobs=self.n_jobs, return_as="generator_unordered")(
|
|
65
51
|
joblib.delayed(self.generate_one)(
|
|
66
|
-
filename=os.path.join(data_dir, str(i)),
|
|
67
|
-
data=data[i],
|
|
68
52
|
seed=int(rng.integers(2**32)),
|
|
53
|
+
filename=os.path.join(data_dir, str(i)),
|
|
54
|
+
context=self.context,
|
|
69
55
|
)
|
|
70
56
|
for i in range(n_samples)
|
|
71
57
|
)
|
|
72
|
-
|
|
73
|
-
|
|
58
|
+
df = pd.DataFrame(
|
|
59
|
+
[r for r in tqdm(jobs, total=n_samples, desc=f"Generating {data_dir}...")]
|
|
60
|
+
)
|
|
61
|
+
df.to_csv(os.path.join(output_dir, METADATA_FILENAME), index=False)
|
|
74
62
|
|
|
75
63
|
def generate(self) -> None:
|
|
76
64
|
rng = default_rng(self.seed)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
|
+
from numpy.random import Generator
|
|
4
5
|
|
|
5
6
|
import phylogenie.generators.configs as cfg
|
|
6
7
|
import phylogenie.generators.typeguards as ctg
|
|
@@ -30,14 +31,6 @@ def _eval_expression(expression: str, data: dict[str, Any]) -> Any:
|
|
|
30
31
|
).tolist()
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
def distribution(x: Distribution, data: dict[str, Any]) -> Distribution:
|
|
34
|
-
args = x.args
|
|
35
|
-
for arg_name, arg_value in args.items():
|
|
36
|
-
if isinstance(arg_value, str):
|
|
37
|
-
args[arg_name] = _eval_expression(arg_value, data)
|
|
38
|
-
return Distribution(type=x.type, **args)
|
|
39
|
-
|
|
40
|
-
|
|
41
34
|
def integer(x: cfg.Integer, data: dict[str, Any]) -> int:
|
|
42
35
|
if isinstance(x, str):
|
|
43
36
|
e = _eval_expression(x, data)
|
|
@@ -209,3 +202,21 @@ def skyline_matrix(
|
|
|
209
202
|
value = [[[e] * N] * M if isinstance(e, pgt.Scalar) else e for e in value]
|
|
210
203
|
|
|
211
204
|
return SkylineMatrix(value=value, change_times=change_times)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def distribution(x: Distribution, data: dict[str, Any]) -> Distribution:
|
|
208
|
+
args = x.args
|
|
209
|
+
for arg_name, arg_value in args.items():
|
|
210
|
+
if isinstance(arg_value, str):
|
|
211
|
+
args[arg_name] = _eval_expression(arg_value, data)
|
|
212
|
+
return Distribution(type=x.type, **args)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def data(context: dict[str, Distribution] | None, rng: Generator) -> dict[str, Any]:
|
|
216
|
+
if context is None:
|
|
217
|
+
return {}
|
|
218
|
+
data: dict[str, Any] = {}
|
|
219
|
+
for k, v in context.items():
|
|
220
|
+
dist = distribution(v, data)
|
|
221
|
+
data[k] = np.array(getattr(rng, dist.type)(**dist.args)).tolist()
|
|
222
|
+
return data
|
phylogenie/generators/trees.py
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from typing import Annotated, Any, Literal
|
|
4
5
|
|
|
5
6
|
import numpy as np
|
|
6
|
-
from numpy.random import
|
|
7
|
+
from numpy.random import default_rng
|
|
7
8
|
from pydantic import Field
|
|
8
9
|
|
|
9
10
|
import phylogenie.generators.configs as cfg
|
|
10
11
|
from phylogenie.generators.dataset import DatasetGenerator, DataType
|
|
11
12
|
from phylogenie.generators.factories import (
|
|
13
|
+
data,
|
|
12
14
|
distribution,
|
|
13
15
|
integer,
|
|
14
16
|
scalar,
|
|
@@ -48,19 +50,19 @@ class TreeDatasetGenerator(DatasetGenerator):
|
|
|
48
50
|
max_time: cfg.Scalar = np.inf
|
|
49
51
|
init_state: str | None = None
|
|
50
52
|
sampling_probability_at_present: cfg.Scalar = 0.0
|
|
53
|
+
timeout: float = np.inf
|
|
51
54
|
|
|
52
55
|
@abstractmethod
|
|
53
56
|
def _get_events(self, data: dict[str, Any]) -> list[Event]: ...
|
|
54
57
|
|
|
55
|
-
def simulate_one(self,
|
|
56
|
-
events = self._get_events(data)
|
|
58
|
+
def simulate_one(self, data: dict[str, Any], seed: int | None = None) -> Tree:
|
|
57
59
|
init_state = (
|
|
58
60
|
self.init_state
|
|
59
61
|
if self.init_state is None
|
|
60
62
|
else self.init_state.format(**data)
|
|
61
63
|
)
|
|
62
64
|
return simulate_tree(
|
|
63
|
-
events=
|
|
65
|
+
events=self._get_events(data),
|
|
64
66
|
min_tips=integer(self.min_tips, data),
|
|
65
67
|
max_tips=integer(self.max_tips, data),
|
|
66
68
|
max_time=scalar(self.max_time, data),
|
|
@@ -68,15 +70,27 @@ class TreeDatasetGenerator(DatasetGenerator):
|
|
|
68
70
|
sampling_probability_at_present=scalar(
|
|
69
71
|
self.sampling_probability_at_present, data
|
|
70
72
|
),
|
|
71
|
-
seed=
|
|
73
|
+
seed=seed,
|
|
74
|
+
timeout=self.timeout,
|
|
72
75
|
)
|
|
73
76
|
|
|
74
|
-
def
|
|
75
|
-
self,
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
77
|
+
def generate_one(
|
|
78
|
+
self,
|
|
79
|
+
filename: str,
|
|
80
|
+
context: dict[str, Any] | None = None,
|
|
81
|
+
seed: int | None = None,
|
|
82
|
+
) -> dict[str, Any]:
|
|
83
|
+
d = {"file_id": Path(filename).stem}
|
|
84
|
+
rng = default_rng(seed)
|
|
85
|
+
while True:
|
|
86
|
+
try:
|
|
87
|
+
d.update(data(context, rng))
|
|
88
|
+
tree = self.simulate_one(d, seed)
|
|
89
|
+
dump_newick(tree, f"{filename}.nwk")
|
|
90
|
+
break
|
|
91
|
+
except TimeoutError:
|
|
92
|
+
print("Simulation timed out, retrying with different parameters...")
|
|
93
|
+
return d
|
|
80
94
|
|
|
81
95
|
|
|
82
96
|
class CanonicalTreeDatasetGenerator(TreeDatasetGenerator):
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import time
|
|
2
3
|
from collections.abc import Sequence
|
|
3
4
|
|
|
4
5
|
import joblib
|
|
@@ -21,7 +22,8 @@ def simulate_tree(
|
|
|
21
22
|
init_state: str | None = None,
|
|
22
23
|
sampling_probability_at_present: float = 0.0,
|
|
23
24
|
seed: int | None = None,
|
|
24
|
-
|
|
25
|
+
timeout: float = np.inf,
|
|
26
|
+
) -> Tree:
|
|
25
27
|
if max_time == np.inf and max_tips == MAX_TIPS:
|
|
26
28
|
raise ValueError("Either max_time or max_tips must be specified.")
|
|
27
29
|
|
|
@@ -41,14 +43,18 @@ def simulate_tree(
|
|
|
41
43
|
raise ValueError(f"Init state {init_state} not found in event states: {states}")
|
|
42
44
|
|
|
43
45
|
rng = default_rng(seed)
|
|
46
|
+
start_clock = time.perf_counter()
|
|
44
47
|
while True:
|
|
45
48
|
model = Model(init_state, events)
|
|
46
49
|
current_time = 0.0
|
|
47
50
|
change_times = sorted(set(t for e in events for t in e.rate.change_times))
|
|
48
51
|
next_change_time = change_times.pop(0) if change_times else np.inf
|
|
49
|
-
|
|
50
52
|
target_n_tips = rng.integers(min_tips, max_tips) if max_time == np.inf else None
|
|
53
|
+
|
|
51
54
|
while current_time < max_time:
|
|
55
|
+
if time.perf_counter() - start_clock > timeout:
|
|
56
|
+
raise TimeoutError("Simulation timed out.")
|
|
57
|
+
|
|
52
58
|
events = model.events
|
|
53
59
|
rates = [e.get_propensity(model, current_time) for e in events]
|
|
54
60
|
|
|
@@ -98,26 +104,35 @@ def generate_trees(
|
|
|
98
104
|
sampling_probability_at_present: float = 0.0,
|
|
99
105
|
seed: int | None = None,
|
|
100
106
|
n_jobs: int = -1,
|
|
107
|
+
timeout: float = np.inf,
|
|
101
108
|
) -> None:
|
|
109
|
+
def _simulate_tree(seed: int) -> Tree:
|
|
110
|
+
while True:
|
|
111
|
+
try:
|
|
112
|
+
return simulate_tree(
|
|
113
|
+
events=events,
|
|
114
|
+
min_tips=min_tips,
|
|
115
|
+
max_tips=max_tips,
|
|
116
|
+
max_time=max_time,
|
|
117
|
+
init_state=init_state,
|
|
118
|
+
sampling_probability_at_present=sampling_probability_at_present,
|
|
119
|
+
seed=seed,
|
|
120
|
+
timeout=timeout,
|
|
121
|
+
)
|
|
122
|
+
except TimeoutError:
|
|
123
|
+
print("Simulation timed out, retrying with a different seed...")
|
|
124
|
+
seed += 1
|
|
125
|
+
|
|
102
126
|
if os.path.exists(output_dir):
|
|
103
127
|
raise FileExistsError(f"Output directory {output_dir} already exists")
|
|
104
128
|
os.mkdir(output_dir)
|
|
105
129
|
|
|
106
130
|
rng = default_rng(seed)
|
|
107
131
|
jobs = joblib.Parallel(n_jobs=n_jobs, return_as="generator_unordered")(
|
|
108
|
-
joblib.delayed(
|
|
109
|
-
events=events,
|
|
110
|
-
min_tips=min_tips,
|
|
111
|
-
max_tips=max_tips,
|
|
112
|
-
max_time=max_time,
|
|
113
|
-
init_state=init_state,
|
|
114
|
-
sampling_probability_at_present=sampling_probability_at_present,
|
|
115
|
-
seed=int(rng.integers(2**32)),
|
|
116
|
-
)
|
|
132
|
+
joblib.delayed(_simulate_tree)(seed=int(rng.integers(2**32)))
|
|
117
133
|
for _ in range(n_trees)
|
|
118
134
|
)
|
|
119
135
|
for i, tree in tqdm(
|
|
120
136
|
enumerate(jobs), total=n_trees, desc=f"Generating trees in {output_dir}..."
|
|
121
137
|
):
|
|
122
|
-
|
|
123
|
-
dump_newick(tree, os.path.join(output_dir, f"{i}.nwk"))
|
|
138
|
+
dump_newick(tree, os.path.join(output_dir, f"{i}.nwk"))
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
phylogenie/__init__.py,sha256=T2mRLsYtoLlWt8GlxrrUnfXJ9XVioq7hTvVq3uJpwQI,2215
|
|
2
2
|
phylogenie/generators/__init__.py,sha256=zsOxy28-9j9alOQLIgrOAFfmM58NNHO_NEtW-KXQXAY,888
|
|
3
|
-
phylogenie/generators/alisim.py,sha256=
|
|
3
|
+
phylogenie/generators/alisim.py,sha256=3mANgyQrlozhslV3_ryt-m4ItkRcKKRLufWf6SNBTnQ,2781
|
|
4
4
|
phylogenie/generators/configs.py,sha256=AiiFS6rpH9BPwDKCkT4SVrRzfLFFrwRCJM4CRj0Srdk,1072
|
|
5
|
-
phylogenie/generators/dataset.py,sha256=
|
|
6
|
-
phylogenie/generators/factories.py,sha256=
|
|
7
|
-
phylogenie/generators/trees.py,sha256=
|
|
5
|
+
phylogenie/generators/dataset.py,sha256=loVKC_1G7gzkPDN9W3GF-Rj9od8AeOJgIC0aJJa-4KA,2110
|
|
6
|
+
phylogenie/generators/factories.py,sha256=jLwDuq0mrmDz2U5rZM19KJ2hSpamG3r6zb83YCc6snA,7619
|
|
7
|
+
phylogenie/generators/trees.py,sha256=q03WPG82M4ucp-jyjoKEBy7TKMBzD3RkKn8hS0G0-i0,10463
|
|
8
8
|
phylogenie/generators/typeguards.py,sha256=yj4VkhOaUXJ2OrY-6zhOeY9C4yKIQxjZtk2d-vIxttQ,828
|
|
9
9
|
phylogenie/io.py,sha256=y7nQIvLgCvqELsXFKfm1GgKJO_saoQ-7zQpE3Kvajzc,3509
|
|
10
10
|
phylogenie/main.py,sha256=vtvSpQxBNlYABoFQ25czl-l3fIr4QRo3svWVd-jcArw,1170
|
|
@@ -20,13 +20,13 @@ phylogenie/treesimulator/events/__init__.py,sha256=UGfvXOVJ_ZAkk_8sBPihjmxciiaEn
|
|
|
20
20
|
phylogenie/treesimulator/events/contact_tracing.py,sha256=_nJ85yhgGkeruQgMHvGpDYoyhheBf8M4LgZWiWdi5dY,4801
|
|
21
21
|
phylogenie/treesimulator/events/core.py,sha256=JokGmieAv2xEX7KsjBWZr05jHN1jB-XZbpxe9gwdbDA,7953
|
|
22
22
|
phylogenie/treesimulator/events/mutations.py,sha256=xkXUIppbLIWZqKwVf-hi7d-_pS42TG2EPVfJA_grxBg,3443
|
|
23
|
-
phylogenie/treesimulator/gillespie.py,sha256=
|
|
23
|
+
phylogenie/treesimulator/gillespie.py,sha256=naoxPyZixWVkd5f7B3KhEtOFiQI4NDIp_589NCLTHKM,4831
|
|
24
24
|
phylogenie/treesimulator/model.py,sha256=0Im6cFTlpMlJrSP4pTTKtvLT9qrQWV8MSTesAsBxT8g,5422
|
|
25
25
|
phylogenie/typeguards.py,sha256=JtqmbEWJZBRHbWgCvcl6nrWm3VcBfzRbklbTBYHItn0,1325
|
|
26
26
|
phylogenie/typings.py,sha256=GknvAFXyiaWeeYJ8Lk5d6E2VHT-xW6ONEojYbtJYiB8,476
|
|
27
27
|
phylogenie/utils.py,sha256=pCg9ob0RpLUHwM49x4knKxL4FNPr3-EU_6zMXsvxtAg,370
|
|
28
|
-
phylogenie-2.1.
|
|
29
|
-
phylogenie-2.1.
|
|
30
|
-
phylogenie-2.1.
|
|
31
|
-
phylogenie-2.1.
|
|
32
|
-
phylogenie-2.1.
|
|
28
|
+
phylogenie-2.1.3.dist-info/LICENSE.txt,sha256=NUrDqElK-eD3I0WqC004CJsy6cs0JgsAoebDv_42-pw,1071
|
|
29
|
+
phylogenie-2.1.3.dist-info/METADATA,sha256=_jiRJ7jNb-CPkhqExE3sWc6RXz8rrbnnxdgXZHbNScc,5375
|
|
30
|
+
phylogenie-2.1.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
31
|
+
phylogenie-2.1.3.dist-info/entry_points.txt,sha256=Rt6_usN0FkBX1ZfiqCirjMN9FKOgFLG8rydcQ8kugeE,51
|
|
32
|
+
phylogenie-2.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|