phylogenie 2.1.2__tar.gz → 2.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of phylogenie might be problematic. Click here for more details.

Files changed (31)
  1. {phylogenie-2.1.2 → phylogenie-2.1.3}/PKG-INFO +1 -1
  2. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/alisim.py +22 -10
  3. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/dataset.py +11 -23
  4. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/factories.py +19 -8
  5. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/trees.py +25 -11
  6. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/gillespie.py +28 -13
  7. {phylogenie-2.1.2 → phylogenie-2.1.3}/pyproject.toml +1 -1
  8. {phylogenie-2.1.2 → phylogenie-2.1.3}/LICENSE.txt +0 -0
  9. {phylogenie-2.1.2 → phylogenie-2.1.3}/README.md +0 -0
  10. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/__init__.py +0 -0
  11. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/__init__.py +0 -0
  12. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/configs.py +0 -0
  13. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/generators/typeguards.py +0 -0
  14. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/io.py +0 -0
  15. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/main.py +0 -0
  16. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/msa.py +0 -0
  17. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/py.typed +0 -0
  18. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/skyline/__init__.py +0 -0
  19. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/skyline/matrix.py +0 -0
  20. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/skyline/parameter.py +0 -0
  21. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/skyline/vector.py +0 -0
  22. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/tree.py +0 -0
  23. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/__init__.py +0 -0
  24. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/events/__init__.py +0 -0
  25. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/events/contact_tracing.py +0 -0
  26. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/events/core.py +0 -0
  27. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/events/mutations.py +0 -0
  28. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/treesimulator/model.py +0 -0
  29. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/typeguards.py +0 -0
  30. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/typings.py +0 -0
  31. {phylogenie-2.1.2 → phylogenie-2.1.3}/phylogenie/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: phylogenie
3
- Version: 2.1.2
3
+ Version: 2.1.3
4
4
  Summary: Generate phylogenetic datasets with minimal setup effort
5
5
  Author: Gabriele Marino
6
6
  Author-email: gabmarino.8601@gmail.com
@@ -3,9 +3,10 @@ import subprocess
3
3
  from pathlib import Path
4
4
  from typing import Any, Literal
5
5
 
6
- from numpy.random import Generator
6
+ from numpy.random import Generator, default_rng
7
7
 
8
8
  from phylogenie.generators.dataset import DatasetGenerator, DataType
9
+ from phylogenie.generators.factories import data
9
10
  from phylogenie.generators.trees import TreeDatasetGeneratorConfig
10
11
  from phylogenie.io import dump_newick
11
12
 
@@ -42,9 +43,12 @@ class AliSimDatasetGenerator(DatasetGenerator):
42
43
  subprocess.run(command, check=True, stdout=subprocess.DEVNULL)
43
44
  subprocess.run(["rm", f"{tree_file}.log"], check=True)
44
45
 
45
- def _generate_one(
46
- self, filename: str, rng: Generator, data: dict[str, Any]
47
- ) -> None:
46
+ def generate_one(
47
+ self,
48
+ filename: str,
49
+ context: dict[str, Any] | None = None,
50
+ seed: int | None = None,
51
+ ) -> dict[str, Any]:
48
52
  if self.keep_trees:
49
53
  base_dir, file_id = Path(filename).parent, Path(filename).stem
50
54
  trees_dir = os.path.join(base_dir, TREES_DIRNAME)
@@ -57,16 +61,24 @@ class AliSimDatasetGenerator(DatasetGenerator):
57
61
  tree_filename = f"{filename}.temp-tree"
58
62
  msa_filename = filename
59
63
 
60
- tree = self.trees.simulate_one(rng, data)
61
- if tree is None:
62
- return
64
+ d: dict[str, Any] = {"file_id": Path(msa_filename).stem}
65
+ rng = default_rng(seed)
66
+ while True:
67
+ d.update(data(context, rng))
68
+ try:
69
+ tree = self.trees.simulate_one(d, seed)
70
+ break
71
+ except TimeoutError:
72
+ print(
73
+ "Tree simulation timed out, retrying with different parameters..."
74
+ )
63
75
 
64
76
  for leaf in tree.get_leaves():
65
77
  leaf.id += f"|{leaf.get_time()}"
66
78
  dump_newick(tree, f"{tree_filename}.nwk")
67
79
 
68
- self._generate_one_from_tree(
69
- filename=msa_filename, tree_file=f"{tree_filename}.nwk", rng=rng, data=data
70
- )
80
+ self._generate_one_from_tree(msa_filename, f"{tree_filename}.nwk", rng, d)
71
81
  if not self.keep_trees:
72
82
  os.remove(f"{tree_filename}.nwk")
83
+
84
+ return d
@@ -1,16 +1,13 @@
1
1
  import os
2
2
  from abc import ABC, abstractmethod
3
3
  from enum import Enum
4
- from itertools import product
5
4
  from typing import Any
6
5
 
7
6
  import joblib
8
- import numpy as np
9
7
  import pandas as pd
10
8
  from numpy.random import Generator, default_rng
11
9
  from tqdm import tqdm
12
10
 
13
- from phylogenie.generators.factories import distribution
14
11
  from phylogenie.utils import Distribution, StrictBaseModel
15
12
 
16
13
 
@@ -31,15 +28,12 @@ class DatasetGenerator(ABC, StrictBaseModel):
31
28
  context: dict[str, Distribution] | None = None
32
29
 
33
30
  @abstractmethod
34
- def _generate_one(
35
- self, filename: str, rng: Generator, data: dict[str, Any]
36
- ) -> None: ...
37
-
38
31
  def generate_one(
39
- self, filename: str, data: dict[str, Any] | None = None, seed: int | None = None
40
- ) -> None:
41
- data = {} if data is None else data
42
- self._generate_one(filename=filename, rng=default_rng(seed), data=data)
32
+ self,
33
+ filename: str,
34
+ context: dict[str, Any] | None = None,
35
+ seed: int | None = None,
36
+ ) -> dict[str, Any]: ...
43
37
 
44
38
  def _generate(self, rng: Generator, n_samples: int, output_dir: str) -> None:
45
39
  if os.path.exists(output_dir):
@@ -53,24 +47,18 @@ class DatasetGenerator(ABC, StrictBaseModel):
53
47
  )
54
48
  os.makedirs(data_dir)
55
49
 
56
- data: list[dict[str, Any]] = [{} for _ in range(n_samples)]
57
- if self.context is not None:
58
- for d, (k, v) in product(data, self.context.items()):
59
- dist = distribution(v, d)
60
- d[k] = np.array(getattr(rng, dist.type)(**dist.args)).tolist()
61
- df = pd.DataFrame([{"file_id": str(i), **d} for i, d in enumerate(data)])
62
- df.to_csv(os.path.join(output_dir, METADATA_FILENAME), index=False)
63
-
64
50
  jobs = joblib.Parallel(n_jobs=self.n_jobs, return_as="generator_unordered")(
65
51
  joblib.delayed(self.generate_one)(
66
- filename=os.path.join(data_dir, str(i)),
67
- data=data[i],
68
52
  seed=int(rng.integers(2**32)),
53
+ filename=os.path.join(data_dir, str(i)),
54
+ context=self.context,
69
55
  )
70
56
  for i in range(n_samples)
71
57
  )
72
- for _ in tqdm(jobs, total=n_samples, desc=f"Generating {data_dir}..."):
73
- pass
58
+ df = pd.DataFrame(
59
+ [r for r in tqdm(jobs, total=n_samples, desc=f"Generating {data_dir}...")]
60
+ )
61
+ df.to_csv(os.path.join(output_dir, METADATA_FILENAME), index=False)
74
62
 
75
63
  def generate(self) -> None:
76
64
  rng = default_rng(self.seed)
@@ -1,6 +1,7 @@
1
1
  from typing import Any
2
2
 
3
3
  import numpy as np
4
+ from numpy.random import Generator
4
5
 
5
6
  import phylogenie.generators.configs as cfg
6
7
  import phylogenie.generators.typeguards as ctg
@@ -30,14 +31,6 @@ def _eval_expression(expression: str, data: dict[str, Any]) -> Any:
30
31
  ).tolist()
31
32
 
32
33
 
33
- def distribution(x: Distribution, data: dict[str, Any]) -> Distribution:
34
- args = x.args
35
- for arg_name, arg_value in args.items():
36
- if isinstance(arg_value, str):
37
- args[arg_name] = _eval_expression(arg_value, data)
38
- return Distribution(type=x.type, **args)
39
-
40
-
41
34
  def integer(x: cfg.Integer, data: dict[str, Any]) -> int:
42
35
  if isinstance(x, str):
43
36
  e = _eval_expression(x, data)
@@ -209,3 +202,21 @@ def skyline_matrix(
209
202
  value = [[[e] * N] * M if isinstance(e, pgt.Scalar) else e for e in value]
210
203
 
211
204
  return SkylineMatrix(value=value, change_times=change_times)
205
+
206
+
207
+ def distribution(x: Distribution, data: dict[str, Any]) -> Distribution:
208
+ args = x.args
209
+ for arg_name, arg_value in args.items():
210
+ if isinstance(arg_value, str):
211
+ args[arg_name] = _eval_expression(arg_value, data)
212
+ return Distribution(type=x.type, **args)
213
+
214
+
215
+ def data(context: dict[str, Distribution] | None, rng: Generator) -> dict[str, Any]:
216
+ if context is None:
217
+ return {}
218
+ data: dict[str, Any] = {}
219
+ for k, v in context.items():
220
+ dist = distribution(v, data)
221
+ data[k] = np.array(getattr(rng, dist.type)(**dist.args)).tolist()
222
+ return data
@@ -1,14 +1,16 @@
1
1
  from abc import abstractmethod
2
2
  from enum import Enum
3
+ from pathlib import Path
3
4
  from typing import Annotated, Any, Literal
4
5
 
5
6
  import numpy as np
6
- from numpy.random import Generator
7
+ from numpy.random import default_rng
7
8
  from pydantic import Field
8
9
 
9
10
  import phylogenie.generators.configs as cfg
10
11
  from phylogenie.generators.dataset import DatasetGenerator, DataType
11
12
  from phylogenie.generators.factories import (
13
+ data,
12
14
  distribution,
13
15
  integer,
14
16
  scalar,
@@ -48,19 +50,19 @@ class TreeDatasetGenerator(DatasetGenerator):
48
50
  max_time: cfg.Scalar = np.inf
49
51
  init_state: str | None = None
50
52
  sampling_probability_at_present: cfg.Scalar = 0.0
53
+ timeout: float = np.inf
51
54
 
52
55
  @abstractmethod
53
56
  def _get_events(self, data: dict[str, Any]) -> list[Event]: ...
54
57
 
55
- def simulate_one(self, rng: Generator, data: dict[str, Any]) -> Tree | None:
56
- events = self._get_events(data)
58
+ def simulate_one(self, data: dict[str, Any], seed: int | None = None) -> Tree:
57
59
  init_state = (
58
60
  self.init_state
59
61
  if self.init_state is None
60
62
  else self.init_state.format(**data)
61
63
  )
62
64
  return simulate_tree(
63
- events=events,
65
+ events=self._get_events(data),
64
66
  min_tips=integer(self.min_tips, data),
65
67
  max_tips=integer(self.max_tips, data),
66
68
  max_time=scalar(self.max_time, data),
@@ -68,15 +70,27 @@ class TreeDatasetGenerator(DatasetGenerator):
68
70
  sampling_probability_at_present=scalar(
69
71
  self.sampling_probability_at_present, data
70
72
  ),
71
- seed=int(rng.integers(2**32)),
73
+ seed=seed,
74
+ timeout=self.timeout,
72
75
  )
73
76
 
74
- def _generate_one(
75
- self, filename: str, rng: Generator, data: dict[str, Any]
76
- ) -> None:
77
- tree = self.simulate_one(rng, data)
78
- if tree is not None:
79
- dump_newick(tree, f"{filename}.nwk")
77
+ def generate_one(
78
+ self,
79
+ filename: str,
80
+ context: dict[str, Any] | None = None,
81
+ seed: int | None = None,
82
+ ) -> dict[str, Any]:
83
+ d = {"file_id": Path(filename).stem}
84
+ rng = default_rng(seed)
85
+ while True:
86
+ try:
87
+ d.update(data(context, rng))
88
+ tree = self.simulate_one(d, seed)
89
+ dump_newick(tree, f"{filename}.nwk")
90
+ break
91
+ except TimeoutError:
92
+ print("Simulation timed out, retrying with different parameters...")
93
+ return d
80
94
 
81
95
 
82
96
  class CanonicalTreeDatasetGenerator(TreeDatasetGenerator):
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import time
2
3
  from collections.abc import Sequence
3
4
 
4
5
  import joblib
@@ -21,7 +22,8 @@ def simulate_tree(
21
22
  init_state: str | None = None,
22
23
  sampling_probability_at_present: float = 0.0,
23
24
  seed: int | None = None,
24
- ) -> Tree | None:
25
+ timeout: float = np.inf,
26
+ ) -> Tree:
25
27
  if max_time == np.inf and max_tips == MAX_TIPS:
26
28
  raise ValueError("Either max_time or max_tips must be specified.")
27
29
 
@@ -41,14 +43,18 @@ def simulate_tree(
41
43
  raise ValueError(f"Init state {init_state} not found in event states: {states}")
42
44
 
43
45
  rng = default_rng(seed)
46
+ start_clock = time.perf_counter()
44
47
  while True:
45
48
  model = Model(init_state, events)
46
49
  current_time = 0.0
47
50
  change_times = sorted(set(t for e in events for t in e.rate.change_times))
48
51
  next_change_time = change_times.pop(0) if change_times else np.inf
49
-
50
52
  target_n_tips = rng.integers(min_tips, max_tips) if max_time == np.inf else None
53
+
51
54
  while current_time < max_time:
55
+ if time.perf_counter() - start_clock > timeout:
56
+ raise TimeoutError("Simulation timed out.")
57
+
52
58
  events = model.events
53
59
  rates = [e.get_propensity(model, current_time) for e in events]
54
60
 
@@ -98,26 +104,35 @@ def generate_trees(
98
104
  sampling_probability_at_present: float = 0.0,
99
105
  seed: int | None = None,
100
106
  n_jobs: int = -1,
107
+ timeout: float = np.inf,
101
108
  ) -> None:
109
+ def _simulate_tree(seed: int) -> Tree:
110
+ while True:
111
+ try:
112
+ return simulate_tree(
113
+ events=events,
114
+ min_tips=min_tips,
115
+ max_tips=max_tips,
116
+ max_time=max_time,
117
+ init_state=init_state,
118
+ sampling_probability_at_present=sampling_probability_at_present,
119
+ seed=seed,
120
+ timeout=timeout,
121
+ )
122
+ except TimeoutError:
123
+ print("Simulation timed out, retrying with a different seed...")
124
+ seed += 1
125
+
102
126
  if os.path.exists(output_dir):
103
127
  raise FileExistsError(f"Output directory {output_dir} already exists")
104
128
  os.mkdir(output_dir)
105
129
 
106
130
  rng = default_rng(seed)
107
131
  jobs = joblib.Parallel(n_jobs=n_jobs, return_as="generator_unordered")(
108
- joblib.delayed(simulate_tree)(
109
- events=events,
110
- min_tips=min_tips,
111
- max_tips=max_tips,
112
- max_time=max_time,
113
- init_state=init_state,
114
- sampling_probability_at_present=sampling_probability_at_present,
115
- seed=int(rng.integers(2**32)),
116
- )
132
+ joblib.delayed(_simulate_tree)(seed=int(rng.integers(2**32)))
117
133
  for _ in range(n_trees)
118
134
  )
119
135
  for i, tree in tqdm(
120
136
  enumerate(jobs), total=n_trees, desc=f"Generating trees in {output_dir}..."
121
137
  ):
122
- if tree is not None:
123
- dump_newick(tree, os.path.join(output_dir, f"{i}.nwk"))
138
+ dump_newick(tree, os.path.join(output_dir, f"{i}.nwk"))
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "phylogenie"
3
- version = "2.1.2"
3
+ version = "2.1.3"
4
4
  description = "Generate phylogenetic datasets with minimal setup effort"
5
5
  authors = ["Gabriele Marino <gabmarino.8601@gmail.com>"]
6
6
  readme = "README.md"
File without changes
File without changes
File without changes
File without changes