phylogenie 2.1.1__py3-none-any.whl → 2.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,9 +3,10 @@ import subprocess
3
3
  from pathlib import Path
4
4
  from typing import Any, Literal
5
5
 
6
- from numpy.random import Generator
6
+ from numpy.random import Generator, default_rng
7
7
 
8
8
  from phylogenie.generators.dataset import DatasetGenerator, DataType
9
+ from phylogenie.generators.factories import data
9
10
  from phylogenie.generators.trees import TreeDatasetGeneratorConfig
10
11
  from phylogenie.io import dump_newick
11
12
 
@@ -42,9 +43,12 @@ class AliSimDatasetGenerator(DatasetGenerator):
42
43
  subprocess.run(command, check=True, stdout=subprocess.DEVNULL)
43
44
  subprocess.run(["rm", f"{tree_file}.log"], check=True)
44
45
 
45
- def _generate_one(
46
- self, filename: str, rng: Generator, data: dict[str, Any]
47
- ) -> None:
46
+ def generate_one(
47
+ self,
48
+ filename: str,
49
+ context: dict[str, Any] | None = None,
50
+ seed: int | None = None,
51
+ ) -> dict[str, Any]:
48
52
  if self.keep_trees:
49
53
  base_dir, file_id = Path(filename).parent, Path(filename).stem
50
54
  trees_dir = os.path.join(base_dir, TREES_DIRNAME)
@@ -57,16 +61,24 @@ class AliSimDatasetGenerator(DatasetGenerator):
57
61
  tree_filename = f"{filename}.temp-tree"
58
62
  msa_filename = filename
59
63
 
60
- tree = self.trees.simulate_one(rng, data)
61
- if tree is None:
62
- return
64
+ d: dict[str, Any] = {"file_id": Path(msa_filename).stem}
65
+ rng = default_rng(seed)
66
+ while True:
67
+ d.update(data(context, rng))
68
+ try:
69
+ tree = self.trees.simulate_one(d, seed)
70
+ break
71
+ except TimeoutError:
72
+ print(
73
+ "Tree simulation timed out, retrying with different parameters..."
74
+ )
63
75
 
64
76
  for leaf in tree.get_leaves():
65
77
  leaf.id += f"|{leaf.get_time()}"
66
78
  dump_newick(tree, f"{tree_filename}.nwk")
67
79
 
68
- self._generate_one_from_tree(
69
- filename=msa_filename, tree_file=f"{tree_filename}.nwk", rng=rng, data=data
70
- )
80
+ self._generate_one_from_tree(msa_filename, f"{tree_filename}.nwk", rng, d)
71
81
  if not self.keep_trees:
72
82
  os.remove(f"{tree_filename}.nwk")
83
+
84
+ return d
@@ -1,16 +1,13 @@
1
1
  import os
2
2
  from abc import ABC, abstractmethod
3
3
  from enum import Enum
4
- from itertools import product
5
4
  from typing import Any
6
5
 
7
6
  import joblib
8
- import numpy as np
9
7
  import pandas as pd
10
8
  from numpy.random import Generator, default_rng
11
9
  from tqdm import tqdm
12
10
 
13
- from phylogenie.generators.factories import distribution
14
11
  from phylogenie.utils import Distribution, StrictBaseModel
15
12
 
16
13
 
@@ -31,15 +28,12 @@ class DatasetGenerator(ABC, StrictBaseModel):
31
28
  context: dict[str, Distribution] | None = None
32
29
 
33
30
  @abstractmethod
34
- def _generate_one(
35
- self, filename: str, rng: Generator, data: dict[str, Any]
36
- ) -> None: ...
37
-
38
31
  def generate_one(
39
- self, filename: str, data: dict[str, Any] | None = None, seed: int | None = None
40
- ) -> None:
41
- data = {} if data is None else data
42
- self._generate_one(filename=filename, rng=default_rng(seed), data=data)
32
+ self,
33
+ filename: str,
34
+ context: dict[str, Any] | None = None,
35
+ seed: int | None = None,
36
+ ) -> dict[str, Any]: ...
43
37
 
44
38
  def _generate(self, rng: Generator, n_samples: int, output_dir: str) -> None:
45
39
  if os.path.exists(output_dir):
@@ -53,24 +47,18 @@ class DatasetGenerator(ABC, StrictBaseModel):
53
47
  )
54
48
  os.makedirs(data_dir)
55
49
 
56
- data: list[dict[str, Any]] = [{} for _ in range(n_samples)]
57
- if self.context is not None:
58
- for d, (k, v) in product(data, self.context.items()):
59
- dist = distribution(v, d)
60
- d[k] = np.array(getattr(rng, dist.type)(**dist.args)).tolist()
61
- df = pd.DataFrame([{"file_id": str(i), **d} for i, d in enumerate(data)])
62
- df.to_csv(os.path.join(output_dir, METADATA_FILENAME), index=False)
63
-
64
50
  jobs = joblib.Parallel(n_jobs=self.n_jobs, return_as="generator_unordered")(
65
51
  joblib.delayed(self.generate_one)(
66
- filename=os.path.join(data_dir, str(i)),
67
- data=data[i],
68
52
  seed=int(rng.integers(2**32)),
53
+ filename=os.path.join(data_dir, str(i)),
54
+ context=self.context,
69
55
  )
70
56
  for i in range(n_samples)
71
57
  )
72
- for _ in tqdm(jobs, total=n_samples, desc=f"Generating {data_dir}..."):
73
- pass
58
+ df = pd.DataFrame(
59
+ [r for r in tqdm(jobs, total=n_samples, desc=f"Generating {data_dir}...")]
60
+ )
61
+ df.to_csv(os.path.join(output_dir, METADATA_FILENAME), index=False)
74
62
 
75
63
  def generate(self) -> None:
76
64
  rng = default_rng(self.seed)
@@ -1,6 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  import numpy as np
4
+ from numpy.random import Generator
4
5
 
5
6
  import phylogenie.generators.configs as cfg
6
7
  import phylogenie.generators.typeguards as ctg
@@ -30,14 +31,6 @@ def _eval_expression(expression: str, data: dict[str, Any]) -> Any:
30
31
  ).tolist()
31
32
 
32
33
 
33
- def distribution(x: Distribution, data: dict[str, Any]) -> Distribution:
34
- args = x.args
35
- for arg_name, arg_value in args.items():
36
- if isinstance(arg_value, str):
37
- args[arg_name] = _eval_expression(arg_value, data)
38
- return Distribution(type=x.type, **args)
39
-
40
-
41
34
  def integer(x: cfg.Integer, data: dict[str, Any]) -> int:
42
35
  if isinstance(x, str):
43
36
  e = _eval_expression(x, data)
@@ -209,3 +202,21 @@ def skyline_matrix(
209
202
  value = [[[e] * N] * M if isinstance(e, pgt.Scalar) else e for e in value]
210
203
 
211
204
  return SkylineMatrix(value=value, change_times=change_times)
205
+
206
+
207
+ def distribution(x: Distribution, data: dict[str, Any]) -> Distribution:
208
+ args = x.args
209
+ for arg_name, arg_value in args.items():
210
+ if isinstance(arg_value, str):
211
+ args[arg_name] = _eval_expression(arg_value, data)
212
+ return Distribution(type=x.type, **args)
213
+
214
+
215
+ def data(context: dict[str, Distribution] | None, rng: Generator) -> dict[str, Any]:
216
+ if context is None:
217
+ return {}
218
+ data: dict[str, Any] = {}
219
+ for k, v in context.items():
220
+ dist = distribution(v, data)
221
+ data[k] = np.array(getattr(rng, dist.type)(**dist.args)).tolist()
222
+ return data
@@ -1,14 +1,16 @@
1
1
  from abc import abstractmethod
2
2
  from enum import Enum
3
+ from pathlib import Path
3
4
  from typing import Annotated, Any, Literal
4
5
 
5
6
  import numpy as np
6
- from numpy.random import Generator
7
+ from numpy.random import default_rng
7
8
  from pydantic import Field
8
9
 
9
10
  import phylogenie.generators.configs as cfg
10
11
  from phylogenie.generators.dataset import DatasetGenerator, DataType
11
12
  from phylogenie.generators.factories import (
13
+ data,
12
14
  distribution,
13
15
  integer,
14
16
  scalar,
@@ -48,19 +50,19 @@ class TreeDatasetGenerator(DatasetGenerator):
48
50
  max_time: cfg.Scalar = np.inf
49
51
  init_state: str | None = None
50
52
  sampling_probability_at_present: cfg.Scalar = 0.0
53
+ timeout: float = np.inf
51
54
 
52
55
  @abstractmethod
53
56
  def _get_events(self, data: dict[str, Any]) -> list[Event]: ...
54
57
 
55
- def simulate_one(self, rng: Generator, data: dict[str, Any]) -> Tree | None:
56
- events = self._get_events(data)
58
+ def simulate_one(self, data: dict[str, Any], seed: int | None = None) -> Tree:
57
59
  init_state = (
58
60
  self.init_state
59
61
  if self.init_state is None
60
62
  else self.init_state.format(**data)
61
63
  )
62
64
  return simulate_tree(
63
- events=events,
65
+ events=self._get_events(data),
64
66
  min_tips=integer(self.min_tips, data),
65
67
  max_tips=integer(self.max_tips, data),
66
68
  max_time=scalar(self.max_time, data),
@@ -68,15 +70,27 @@ class TreeDatasetGenerator(DatasetGenerator):
68
70
  sampling_probability_at_present=scalar(
69
71
  self.sampling_probability_at_present, data
70
72
  ),
71
- seed=int(rng.integers(2**32)),
73
+ seed=seed,
74
+ timeout=self.timeout,
72
75
  )
73
76
 
74
- def _generate_one(
75
- self, filename: str, rng: Generator, data: dict[str, Any]
76
- ) -> None:
77
- tree = self.simulate_one(rng, data)
78
- if tree is not None:
79
- dump_newick(tree, f"{filename}.nwk")
77
+ def generate_one(
78
+ self,
79
+ filename: str,
80
+ context: dict[str, Any] | None = None,
81
+ seed: int | None = None,
82
+ ) -> dict[str, Any]:
83
+ d = {"file_id": Path(filename).stem}
84
+ rng = default_rng(seed)
85
+ while True:
86
+ try:
87
+ d.update(data(context, rng))
88
+ tree = self.simulate_one(d, seed)
89
+ dump_newick(tree, f"{filename}.nwk")
90
+ break
91
+ except TimeoutError:
92
+ print("Simulation timed out, retrying with different parameters...")
93
+ return d
80
94
 
81
95
 
82
96
  class CanonicalTreeDatasetGenerator(TreeDatasetGenerator):
phylogenie/tree.py CHANGED
@@ -73,6 +73,14 @@ class Tree:
73
73
  def delete(self, key: str) -> None:
74
74
  del self._features[key]
75
75
 
76
+ def copy(self):
77
+ new_tree = Tree(self.id, self.branch_length)
78
+ for key, value in self._features.items():
79
+ new_tree.set(key, value)
80
+ for child in self.children:
81
+ new_tree.add_child(child.copy())
82
+ return new_tree
83
+
76
84
  def __iter__(self) -> Iterator["Tree"]:
77
85
  return self.preorder_traversal()
78
86
 
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import time
2
3
  from collections.abc import Sequence
3
4
 
4
5
  import joblib
@@ -21,7 +22,8 @@ def simulate_tree(
21
22
  init_state: str | None = None,
22
23
  sampling_probability_at_present: float = 0.0,
23
24
  seed: int | None = None,
24
- ) -> Tree | None:
25
+ timeout: float = np.inf,
26
+ ) -> Tree:
25
27
  if max_time == np.inf and max_tips == MAX_TIPS:
26
28
  raise ValueError("Either max_time or max_tips must be specified.")
27
29
 
@@ -41,14 +43,18 @@ def simulate_tree(
41
43
  raise ValueError(f"Init state {init_state} not found in event states: {states}")
42
44
 
43
45
  rng = default_rng(seed)
46
+ start_clock = time.perf_counter()
44
47
  while True:
45
48
  model = Model(init_state, events)
46
49
  current_time = 0.0
47
50
  change_times = sorted(set(t for e in events for t in e.rate.change_times))
48
51
  next_change_time = change_times.pop(0) if change_times else np.inf
49
-
50
52
  target_n_tips = rng.integers(min_tips, max_tips) if max_time == np.inf else None
53
+
51
54
  while current_time < max_time:
55
+ if time.perf_counter() - start_clock > timeout:
56
+ raise TimeoutError("Simulation timed out.")
57
+
52
58
  events = model.events
53
59
  rates = [e.get_propensity(model, current_time) for e in events]
54
60
 
@@ -98,26 +104,35 @@ def generate_trees(
98
104
  sampling_probability_at_present: float = 0.0,
99
105
  seed: int | None = None,
100
106
  n_jobs: int = -1,
107
+ timeout: float = np.inf,
101
108
  ) -> None:
109
+ def _simulate_tree(seed: int) -> Tree:
110
+ while True:
111
+ try:
112
+ return simulate_tree(
113
+ events=events,
114
+ min_tips=min_tips,
115
+ max_tips=max_tips,
116
+ max_time=max_time,
117
+ init_state=init_state,
118
+ sampling_probability_at_present=sampling_probability_at_present,
119
+ seed=seed,
120
+ timeout=timeout,
121
+ )
122
+ except TimeoutError:
123
+ print("Simulation timed out, retrying with a different seed...")
124
+ seed += 1
125
+
102
126
  if os.path.exists(output_dir):
103
127
  raise FileExistsError(f"Output directory {output_dir} already exists")
104
128
  os.mkdir(output_dir)
105
129
 
106
130
  rng = default_rng(seed)
107
131
  jobs = joblib.Parallel(n_jobs=n_jobs, return_as="generator_unordered")(
108
- joblib.delayed(simulate_tree)(
109
- events=events,
110
- min_tips=min_tips,
111
- max_tips=max_tips,
112
- max_time=max_time,
113
- init_state=init_state,
114
- sampling_probability_at_present=sampling_probability_at_present,
115
- seed=int(rng.integers(2**32)),
116
- )
132
+ joblib.delayed(_simulate_tree)(seed=int(rng.integers(2**32)))
117
133
  for _ in range(n_trees)
118
134
  )
119
135
  for i, tree in tqdm(
120
136
  enumerate(jobs), total=n_trees, desc=f"Generating trees in {output_dir}..."
121
137
  ):
122
- if tree is not None:
123
- dump_newick(tree, os.path.join(output_dir, f"{i}.nwk"))
138
+ dump_newick(tree, os.path.join(output_dir, f"{i}.nwk"))
@@ -1,7 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from collections import defaultdict
3
3
  from collections.abc import Sequence
4
- from copy import deepcopy
5
4
  from dataclasses import dataclass
6
5
  from typing import Any
7
6
 
@@ -122,7 +121,7 @@ class Model:
122
121
  return self._population[id].state
123
122
 
124
123
  def get_sampled_tree(self) -> Tree:
125
- tree = deepcopy(self._tree)
124
+ tree = self._tree.copy()
126
125
  for node in list(tree.postorder_traversal()):
127
126
  if node.id not in self._sampled and not node.children:
128
127
  if node.parent is None:
@@ -142,7 +141,7 @@ class Model:
142
141
  return tree
143
142
 
144
143
  def get_full_tree(self) -> Tree:
145
- return deepcopy(self._tree)
144
+ return self._tree.copy()
146
145
 
147
146
  def get_population(self, states: str | None = None) -> list[int]:
148
147
  if states is None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phylogenie
3
- Version: 2.1.1
3
+ Version: 2.1.3
4
4
  Summary: Generate phylogenetic datasets with minimal setup effort
5
5
  Author: Gabriele Marino
6
6
  Author-email: gabmarino.8601@gmail.com
@@ -1,10 +1,10 @@
1
1
  phylogenie/__init__.py,sha256=T2mRLsYtoLlWt8GlxrrUnfXJ9XVioq7hTvVq3uJpwQI,2215
2
2
  phylogenie/generators/__init__.py,sha256=zsOxy28-9j9alOQLIgrOAFfmM58NNHO_NEtW-KXQXAY,888
3
- phylogenie/generators/alisim.py,sha256=dDqlSwLDbRE2u5SZlsq1mArobTBtuk0aeXY3m1N-bWA,2374
3
+ phylogenie/generators/alisim.py,sha256=3mANgyQrlozhslV3_ryt-m4ItkRcKKRLufWf6SNBTnQ,2781
4
4
  phylogenie/generators/configs.py,sha256=AiiFS6rpH9BPwDKCkT4SVrRzfLFFrwRCJM4CRj0Srdk,1072
5
- phylogenie/generators/dataset.py,sha256=wYtb5fxjrM0bD9KVDlgZPpiR4ACezdKKfsmsXyc4__0,2755
6
- phylogenie/generators/factories.py,sha256=14p1wvJtjdXgM4mPqV9vX34HYcKpemx9TPfemvoB5Wo,7265
7
- phylogenie/generators/trees.py,sha256=EEj5KPuu7FSOoti_gGTmgWgiX68rEKFh57Hp6BDw8po,10049
5
+ phylogenie/generators/dataset.py,sha256=loVKC_1G7gzkPDN9W3GF-Rj9od8AeOJgIC0aJJa-4KA,2110
6
+ phylogenie/generators/factories.py,sha256=jLwDuq0mrmDz2U5rZM19KJ2hSpamG3r6zb83YCc6snA,7619
7
+ phylogenie/generators/trees.py,sha256=q03WPG82M4ucp-jyjoKEBy7TKMBzD3RkKn8hS0G0-i0,10463
8
8
  phylogenie/generators/typeguards.py,sha256=yj4VkhOaUXJ2OrY-6zhOeY9C4yKIQxjZtk2d-vIxttQ,828
9
9
  phylogenie/io.py,sha256=y7nQIvLgCvqELsXFKfm1GgKJO_saoQ-7zQpE3Kvajzc,3509
10
10
  phylogenie/main.py,sha256=vtvSpQxBNlYABoFQ25czl-l3fIr4QRo3svWVd-jcArw,1170
@@ -14,19 +14,19 @@ phylogenie/skyline/__init__.py,sha256=7pF4CUb4ZCLzNYJNhOjpuTOLTRhlK7L6ugfccNqjIG
14
14
  phylogenie/skyline/matrix.py,sha256=Gl8OgKjtieG0NwPYiPimKI36gefV8fm_OeorjdXxPTs,9146
15
15
  phylogenie/skyline/parameter.py,sha256=EM9qlPt0JhMBy3TbztM0dj24BaGNEy8KWKdTObDKhbI,4644
16
16
  phylogenie/skyline/vector.py,sha256=bJP7_FNX_Klt6wXqsyfj0KX3VNj6-dIhzCKSJuQcOV0,7115
17
- phylogenie/tree.py,sha256=fmCLDNIls3VjY413b-khzj-2jlaQ9rJo7AFN1cz4zXk,2411
17
+ phylogenie/tree.py,sha256=Cum74mTdmgfGXk25dnvUngr4zDYRyWFq5zThBh0QFog,2677
18
18
  phylogenie/treesimulator/__init__.py,sha256=XG_xwETKWgDmCihqNUFCcMHtFg4WvZu5qbqWn9Dndt8,879
19
19
  phylogenie/treesimulator/events/__init__.py,sha256=UGfvXOVJ_ZAkk_8sBPihjmxciiaEnXZEPFIY53sttWI,940
20
20
  phylogenie/treesimulator/events/contact_tracing.py,sha256=_nJ85yhgGkeruQgMHvGpDYoyhheBf8M4LgZWiWdi5dY,4801
21
21
  phylogenie/treesimulator/events/core.py,sha256=JokGmieAv2xEX7KsjBWZr05jHN1jB-XZbpxe9gwdbDA,7953
22
22
  phylogenie/treesimulator/events/mutations.py,sha256=xkXUIppbLIWZqKwVf-hi7d-_pS42TG2EPVfJA_grxBg,3443
23
- phylogenie/treesimulator/gillespie.py,sha256=TMDNKBkFwVyAEhBlbwxCTA61GuGwP-42HxpsAVXiU0s,4275
24
- phylogenie/treesimulator/model.py,sha256=bWoFiO-99tTjHtueKPvGV9RSHQKYJ625XP1dvYbP1_Q,5454
23
+ phylogenie/treesimulator/gillespie.py,sha256=naoxPyZixWVkd5f7B3KhEtOFiQI4NDIp_589NCLTHKM,4831
24
+ phylogenie/treesimulator/model.py,sha256=0Im6cFTlpMlJrSP4pTTKtvLT9qrQWV8MSTesAsBxT8g,5422
25
25
  phylogenie/typeguards.py,sha256=JtqmbEWJZBRHbWgCvcl6nrWm3VcBfzRbklbTBYHItn0,1325
26
26
  phylogenie/typings.py,sha256=GknvAFXyiaWeeYJ8Lk5d6E2VHT-xW6ONEojYbtJYiB8,476
27
27
  phylogenie/utils.py,sha256=pCg9ob0RpLUHwM49x4knKxL4FNPr3-EU_6zMXsvxtAg,370
28
- phylogenie-2.1.1.dist-info/LICENSE.txt,sha256=NUrDqElK-eD3I0WqC004CJsy6cs0JgsAoebDv_42-pw,1071
29
- phylogenie-2.1.1.dist-info/METADATA,sha256=Ay--Y2G4F1KwK9GgJFhtiTwGyKJ815JUCLYbXMyek1o,5375
30
- phylogenie-2.1.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
- phylogenie-2.1.1.dist-info/entry_points.txt,sha256=Rt6_usN0FkBX1ZfiqCirjMN9FKOgFLG8rydcQ8kugeE,51
32
- phylogenie-2.1.1.dist-info/RECORD,,
28
+ phylogenie-2.1.3.dist-info/LICENSE.txt,sha256=NUrDqElK-eD3I0WqC004CJsy6cs0JgsAoebDv_42-pw,1071
29
+ phylogenie-2.1.3.dist-info/METADATA,sha256=_jiRJ7jNb-CPkhqExE3sWc6RXz8rrbnnxdgXZHbNScc,5375
30
+ phylogenie-2.1.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
31
+ phylogenie-2.1.3.dist-info/entry_points.txt,sha256=Rt6_usN0FkBX1ZfiqCirjMN9FKOgFLG8rydcQ8kugeE,51
32
+ phylogenie-2.1.3.dist-info/RECORD,,