msasim 2024.5.22__tar.gz → 2024.5.2719__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.1
2
+ Name: msasim
3
+ Version: 2024.5.2719
4
+ Summary: A fast MSA simulator
5
+ Home-page: https://github.com/elyawy/Sailfish-backend
6
+ Author: Elya Wygoda
7
+ Author-email: elya.wygoda@gmail.com
8
+ Requires-Python: >=3.6
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Provides-Extra: test
12
+ Requires-Dist: pytest; extra == "test"
13
+
14
+ # Sailfish
15
+
16
+ Sailfish is a performant multiple sequence alignment (MSA) simulator, written in C++ and Python, allowing for quick and easy generation of large simulated datasets.
17
+
18
+ ## Project goals
19
+
20
+ - Ease of use
21
+ - Speed
22
+ - Modularity
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install msasim
28
+ ```
29
+
30
+ ## Example
31
+
32
+ ```python
33
+ from msasim import sailfish as sim
34
+ from msasim.sailfish import MODEL_CODES, ZipfDistribution
35
+
36
+ ROOT_SEQUENCE_LENGTH = 100
37
+
38
+ sim_protocol = sim.SimProtocol("(A:0.5,B:0.5);",
39
+ deletion_rate=0.01,
40
+ insertion_rate=0.01,
41
+ deletion_dist=ZipfDistribution(1.08, 50),
42
+ insertion_dist=ZipfDistribution(1.08, 50),
43
+ seed=50)
44
+ sim_protocol.set_sequence_size(ROOT_SEQUENCE_LENGTH)
45
+
46
+ simulation = sim.Simulator(sim_protocol, simulation_type=sim.SIMULATION_TYPE.PROTEIN)
47
+
48
+ simulation.set_replacement_model(model=MODEL_CODES.WAG,
49
+ gamma_parameters_alpha=1.0,
50
+ gamma_parameters_catergories=4)
51
+ msa = simulation()
52
+ msa.print_msa()
53
+
54
+ ```
@@ -0,0 +1,41 @@
1
+ # Sailfish
2
+
3
+ Sailfish is a performant multiple sequence alignment (MSA) simulator, written in C++ and Python, allowing for quick and easy generation of large simulated datasets.
4
+
5
+ ## Project goals
6
+
7
+ - Ease of use
8
+ - Speed
9
+ - Modularity
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ pip install msasim
15
+ ```
16
+
17
+ ## Example
18
+
19
+ ```python
20
+ from msasim import sailfish as sim
21
+ from msasim.sailfish import MODEL_CODES, ZipfDistribution
22
+
23
+ ROOT_SEQUENCE_LENGTH = 100
24
+
25
+ sim_protocol = sim.SimProtocol("(A:0.5,B:0.5);",
26
+ deletion_rate=0.01,
27
+ insertion_rate=0.01,
28
+ deletion_dist=ZipfDistribution(1.08, 50),
29
+ insertion_dist=ZipfDistribution(1.08, 50),
30
+ seed=50)
31
+ sim_protocol.set_sequence_size(ROOT_SEQUENCE_LENGTH)
32
+
33
+ simulation = sim.Simulator(sim_protocol, simulation_type=sim.SIMULATION_TYPE.PROTEIN)
34
+
35
+ simulation.set_replacement_model(model=MODEL_CODES.WAG,
36
+ gamma_parameters_alpha=1.0,
37
+ gamma_parameters_catergories=4)
38
+ msa = simulation()
39
+ msa.print_msa()
40
+
41
+ ```
@@ -368,8 +368,8 @@ class Msa:
368
368
  '''
369
369
  The MSA class from the simulator
370
370
  '''
371
- def __init__(self, species_dict: Dict[str, BlockTree], root_node):
372
- self._msa = _Sailfish.Msa(species_dict, root_node)
371
+ def __init__(self, species_dict: Dict[str, BlockTree], root_node, save_list: List[bool]):
372
+ self._msa = _Sailfish.Msa(species_dict, root_node, save_list)
373
373
 
374
374
  def generate_msas(self, node):
375
375
  self._msa.generate_msas(node)
@@ -504,7 +504,15 @@ class Simulator:
504
504
  def gen_indels(self) -> BlockTreePython:
505
505
  return BlockTreePython(self._simulator.gen_indels())
506
506
 
507
+ def get_sequences_to_save(self) -> List[bool]:
508
+ return self._simulator.get_saved_nodes_mask()
507
509
 
510
+ def save_root_sequence(self):
511
+ self._simulator.save_root_sequence()
512
+
513
+ def save_all_nodes_sequences(self):
514
+ self._simulator.save_all_nodes_sequences()
515
+
508
516
  def gen_substitutions(self, length: int):
509
517
  if not self._is_sub_model_init:
510
518
  self._init_sub_model()
@@ -515,10 +523,14 @@ class Simulator:
515
523
  Msas = []
516
524
  for _ in range(times):
517
525
  if self._simProtocol._is_insertion_rate_zero and self._simProtocol._is_deletion_rate_zero:
518
- msa = Msa(self._simProtocol.get_tree().get_num_leaves(), self._simProtocol.get_sequence_size())
526
+ msa = Msa(sum(self.get_sequences_to_save()),
527
+ self._simProtocol.get_sequence_size(),
528
+ self.get_sequences_to_save())
519
529
  else:
520
530
  blocktree = self.gen_indels()
521
- msa = Msa(blocktree._get_Sailfish_blocks(), self._simProtocol._get_root())
531
+ msa = Msa(blocktree._get_Sailfish_blocks(),
532
+ self._simProtocol._get_root(),
533
+ self.get_sequences_to_save())
522
534
 
523
535
  # sim.init_substitution_sim(mFac)
524
536
  substitutions = self.gen_substitutions(msa.get_length())
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.1
2
+ Name: msasim
3
+ Version: 2024.5.2719
4
+ Summary: A fast MSA simulator
5
+ Home-page: https://github.com/elyawy/Sailfish-backend
6
+ Author: Elya Wygoda
7
+ Author-email: elya.wygoda@gmail.com
8
+ Requires-Python: >=3.6
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Provides-Extra: test
12
+ Requires-Dist: pytest; extra == "test"
13
+
14
+ # Sailfish
15
+
16
+ Sailfish is a performant multiple sequence alignment (MSA) simulator, written in C++ and Python, allowing for quick and easy generation of large simulated datasets.
17
+
18
+ ## Project goals
19
+
20
+ - Ease of use
21
+ - Speed
22
+ - Modularity
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install msasim
28
+ ```
29
+
30
+ ## Example
31
+
32
+ ```python
33
+ from msasim import sailfish as sim
34
+ from msasim.sailfish import MODEL_CODES, ZipfDistribution
35
+
36
+ ROOT_SEQUENCE_LENGTH = 100
37
+
38
+ sim_protocol = sim.SimProtocol("(A:0.5,B:0.5);",
39
+ deletion_rate=0.01,
40
+ insertion_rate=0.01,
41
+ deletion_dist=ZipfDistribution(1.08, 50),
42
+ insertion_dist=ZipfDistribution(1.08, 50),
43
+ seed=50)
44
+ sim_protocol.set_sequence_size(ROOT_SEQUENCE_LENGTH)
45
+
46
+ simulation = sim.Simulator(sim_protocol, simulation_type=sim.SIMULATION_TYPE.PROTEIN)
47
+
48
+ simulation.set_replacement_model(model=MODEL_CODES.WAG,
49
+ gamma_parameters_alpha=1.0,
50
+ gamma_parameters_catergories=4)
51
+ msa = simulation()
52
+ msa.print_msa()
53
+
54
+ ```
@@ -2,8 +2,8 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  setup.py
5
- Sailfish/__init__.py
6
- Sailfish/simulator.py
5
+ msasim/__init__.py
6
+ msasim/sailfish.py
7
7
  msasim.egg-info/PKG-INFO
8
8
  msasim.egg-info/SOURCES.txt
9
9
  msasim.egg-info/dependency_links.txt
@@ -1,2 +1,2 @@
1
- Sailfish
2
1
  _Sailfish
2
+ msasim
@@ -6,9 +6,9 @@ from setuptools import setup, find_packages
6
6
 
7
7
  from datetime import datetime
8
8
 
9
- today = datetime.today()
9
+ now = datetime.now()
10
10
 
11
- __version__ = f"{today.year}.{today.month}.{today.day}"
11
+ __version__ = f"{now.year}.{now.month}.{now.day}{now.hour}"
12
12
 
13
13
  # The main interface is through Pybind11Extension.
14
14
  # * You can add cxx_std=11/14/17, and then build_ext can be removed.
@@ -28,7 +28,7 @@ def print_sources(sources):
28
28
  ext_modules = [
29
29
  Pybind11Extension("_Sailfish",
30
30
  sources = sorted(print_sources(glob("src/*.cpp"))),
31
- cxx_std = "14",
31
+ cxx_std = "17",
32
32
  extra_objects=[str(x) for x in Path(".").resolve().glob("libs/*") if x.is_file()],
33
33
  # extra_compile_args=["-g"],
34
34
  # Example: passing in the version to the compiled code
@@ -44,11 +44,13 @@ setup(
44
44
  author_email="elya.wygoda@gmail.com",
45
45
  url="https://github.com/elyawy/Sailfish-backend",
46
46
  description="A fast MSA simulator",
47
- long_description="Sailfish is a performant multiple sequence alignment simulator, written in C++, allowing fast generation of large simualted datasets.",
47
+ # long_description="Sailfish is a performant multiple sequence alignment simulator, written in C++, allowing fast generation of large simualted datasets.",
48
+ long_description=open("README.md", 'r').read(),
49
+ long_description_content_type='text/markdown',
48
50
  ext_modules=ext_modules,
49
51
  extras_require={"test": "pytest"},
50
52
  cmdclass={"build_ext": build_ext},
51
53
  zip_safe=False,
52
54
  python_requires=">=3.6",
53
- packages=find_packages(include=['Sailfish','tests'])
55
+ packages=find_packages(include=['msasim','tests'])
54
56
  )
@@ -133,12 +133,15 @@ PYBIND11_MODULE(_Sailfish, m) {
133
133
  .def("init_substitution_sim", &Simulator::initSubstitionSim)
134
134
  .def("gen_substitutions", &Simulator::simulateSubstitutions)
135
135
  .def("save_site_rates", &Simulator::setSaveRates)
136
- .def("get_site_rates", &Simulator::getSiteRates);
136
+ .def("get_site_rates", &Simulator::getSiteRates)
137
+ .def("save_all_nodes_sequences", &Simulator::setSaveAllNodes)
138
+ .def("save_root_sequence", &Simulator::setSaveRoot)
139
+ .def("get_saved_nodes_mask", &Simulator::getNodesSaveList);
137
140
 
138
141
 
139
142
  py::class_<MSA>(m, "Msa")
140
- .def(py::init<size_t, size_t>())
141
- .def(py::init<BlockMap, tree::TreeNode*>())
143
+ .def(py::init<size_t, size_t, const std::vector<bool>&>())
144
+ .def(py::init<BlockMap, tree::TreeNode*, const std::vector<bool>&>())
142
145
  .def("generate_msas", &MSA::generateMSAs)
143
146
  .def("length", &MSA::getMSAlength)
144
147
  .def("num_sequences", &MSA::getNumberOfSequences)
@@ -115,12 +115,11 @@ void rateMatrixSim::generate_substitution_log(int seqLength) {
115
115
  sumOfRatesAcrossSites += ratesVec[h];
116
116
  }
117
117
  if (_saveRates) _siteRates.insert(_siteRates.end(), ratesVec.begin(), ratesVec.end());
118
- MDOUBLE sumOfRatesNoramlizingFactor = 1.0 / sumOfRatesAcrossSites;
119
-
120
- _siteSampler = std::make_unique<DiscreteDistribution>(ratesVec, sumOfRatesNoramlizingFactor);
118
+ // MDOUBLE sumOfRatesNoramlizingFactor = 1.0 / sumOfRatesAcrossSites;
121
119
 
120
+ // _siteSampler = std::make_unique<DiscreteDistribution>(ratesVec, sumOfRatesNoramlizingFactor);
122
121
  _rootSequence.resize(seqLength);
123
- generateRootSeq(seqLength);
122
+ generateRootSeq(seqLength, ratesVec);
124
123
  if (_nodesToSave[_et->getRoot()->id()]) saveSequence(_et->getRoot()->id(), _et->getRoot()->name());
125
124
 
126
125
  mutateSeqRecuresively(_et->getRoot(), seqLength);
@@ -128,13 +127,13 @@ void rateMatrixSim::generate_substitution_log(int seqLength) {
128
127
  }
129
128
 
130
129
  void rateMatrixSim::mutateSeqRecuresively(tree::nodeP currentNode, int seqLength) {
130
+ if (currentNode->isLeaf()) return;
131
131
 
132
132
  for (auto &node: currentNode->getSons()) {
133
133
  mutateSeqAlongBranch(node, seqLength);
134
- mutateSeqRecuresively(node, seqLength);
135
134
  if (_nodesToSave[node->id()]) saveSequence(node->id(), node->name());
135
+ mutateSeqRecuresively(node, seqLength);
136
136
 
137
- // std::cout << "Node: " << currentNode->id() << "\n";
138
137
  if (!_subManager.isEmpty(currentNode->id())) {
139
138
  _subManager.undoSubs(currentNode->id(), _rootSequence, _rateCategories, _sp.get());
140
139
  }
@@ -186,7 +185,7 @@ void rateMatrixSim::mutateSeqGillespie(tree::nodeP currentNode, int seqLength, M
186
185
  errorMsg::reportError("waiting time is negative :(");
187
186
  }
188
187
 
189
- int mutatedSite = _siteSampler->drawSample() - 1;
188
+ int mutatedSite = _subManager.sampleSite(*_mt_rand);
190
189
  ALPHACHAR parentChar = _rootSequence[mutatedSite];
191
190
  ALPHACHAR nextChar = _gillespieSampler[parentChar]->drawSample() - 1;
192
191
  // std::cout << (int)parentChar << "->" << (int)nextChar << "\n";
@@ -203,16 +202,17 @@ void rateMatrixSim::mutateSeqGillespie(tree::nodeP currentNode, int seqLength, M
203
202
 
204
203
 
205
204
 
206
- void rateMatrixSim::generateRootSeq(int seqLength) {
205
+ void rateMatrixSim::generateRootSeq(int seqLength, std::vector<MDOUBLE>& ratesVec) {
207
206
  size_t rootID = _et->getRoot()->id();
208
207
  for (int i = 0; i < seqLength; i++) {
209
208
  ALPHACHAR newChar = _frequencySampler->drawSample() - 1;
209
+ // ratesVec[i] = ratesVec[i]*(-_sp->Qij(newChar, newChar));
210
210
  _rootSequence[i] = newChar;
211
211
  }
212
212
  // std::cout << ">Root-sequence\n" << _rootSequence << "\n";
213
- _subManager.handleRootSequence(seqLength, _rateCategories, _sp.get(), _rootSequence);
214
-
215
-
213
+ // std::cout << ">Rates\n" << ratesVec;
214
+ _subManager.handleRootSequence(seqLength, ratesVec, _sp.get(), _rootSequence);
215
+
216
216
  _rootSequence.setAlphabet(_alph);
217
217
  _rootSequence.setName(_et->getRoot()->name());
218
218
  _rootSequence.setID(_et->getRoot()->id());
@@ -223,6 +223,7 @@ void rateMatrixSim::saveSequence(const int &nodeId,const std::string &name) {
223
223
  sequence temp(_rootSequence);
224
224
  temp.setName(name);
225
225
  temp.setID(nodeId);
226
+ // std::cout << temp << "\n";
226
227
  _simulatedSequences->add(temp);
227
228
  }
228
229
 
@@ -252,13 +253,36 @@ std::unique_ptr<sequenceContainer> rateMatrixSim::getSequenceContainer() {
252
253
  return std::move(outputSequences);
253
254
  }
254
255
 
255
- void rateMatrixSim::setNodesToSaves(std::vector<size_t> nodeIDs) {
256
+ void rateMatrixSim::setNodesToSave(std::vector<size_t> nodeIDs) {
256
257
  std::fill(_nodesToSave.begin(), _nodesToSave.end(), false);
257
258
  for(auto &nodeID: nodeIDs) {
258
259
  _nodesToSave[nodeID] = true;
259
260
  }
260
261
  }
261
262
 
263
+ void rateMatrixSim::setSaveAllNodes() {
264
+ for (size_t i = 0; i < _nodesToSave.size(); i++) {
265
+ _nodesToSave[i] = true;
266
+ }
267
+ }
268
+
269
+ void rateMatrixSim::setSaveRoot() {
270
+ _nodesToSave[0] = true;
271
+ }
272
+
273
+
274
+ void rateMatrixSim::changeNodeSaveState(size_t nodeID) {
275
+ _nodesToSave[nodeID] = !_nodesToSave[nodeID];
276
+ }
277
+
278
+ bool rateMatrixSim::getNodeSaveState(size_t nodeID) {
279
+ return _nodesToSave[nodeID];
280
+ }
281
+
282
+ const std::vector<bool>& rateMatrixSim::getNodesSaveList() {
283
+ return _nodesToSave;
284
+ }
285
+
262
286
  bool rateMatrixSim::testSumOfRates() {
263
287
  MDOUBLE sumOfRates = 0.0;
264
288
  for (size_t i = 0; i < _rootSequence.seqLen(); i++) {
msasim-2024.5.22/PKG-INFO DELETED
@@ -1,13 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: msasim
3
- Version: 2024.5.22
4
- Summary: A fast MSA simulator
5
- Home-page: https://github.com/elyawy/Sailfish-backend
6
- Author: Elya Wygoda
7
- Author-email: elya.wygoda@gmail.com
8
- Requires-Python: >=3.6
9
- License-File: LICENSE
10
- Provides-Extra: test
11
- Requires-Dist: pytest; extra == "test"
12
-
13
- Sailfish is a performant multiple sequence alignment simulator, written in C++, allowing fast generation of large simualted datasets.
@@ -1,2 +0,0 @@
1
- # Sailfish-backend
2
- Backend C++ code for the Sailfish MSA simulation library using the Pybind11 framework
@@ -1,13 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: msasim
3
- Version: 2024.5.22
4
- Summary: A fast MSA simulator
5
- Home-page: https://github.com/elyawy/Sailfish-backend
6
- Author: Elya Wygoda
7
- Author-email: elya.wygoda@gmail.com
8
- Requires-Python: >=3.6
9
- License-File: LICENSE
10
- Provides-Extra: test
11
- Requires-Dist: pytest; extra == "test"
12
-
13
- Sailfish is a performant multiple sequence alignment simulator, written in C++, allowing fast generation of large simualted datasets.
File without changes
File without changes
File without changes