msasim 25.10.15__tar.gz → 25.11.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: msasim
3
- Version: 25.10.15
3
+ Version: 25.11.1
4
4
  Summary: A fast MSA simulator
5
5
  Home-page: https://github.com/elyawy/Sailfish-backend
6
6
  Author: Elya Wygoda
@@ -376,6 +376,8 @@ class modelFactory:
376
376
  ...
377
377
  def set_invariant_sites_proportion(self, arg0: float) -> None:
378
378
  ...
379
+ def set_site_rate_correlation(self, arg0: float) -> None:
380
+ ...
379
381
  def set_model_parameters(self, arg0: list[float]) -> None:
380
382
  ...
381
383
  def set_replacement_model(self, arg0: modelCode) -> None:
@@ -25,14 +25,14 @@ class Distribution:
25
25
  raise ValueError(f"Each value of the probabilities should be between 0 to 1. Received a value of {x}")
26
26
  self._dist = _Sailfish.DiscreteDistribution(dist)
27
27
 
28
- def draw_sample(self) -> int:
29
- return self._dist.draw_sample()
28
+ # def draw_sample(self) -> int:
29
+ # return self._dist.draw_sample()
30
30
 
31
- def set_seed(self, seed: int) -> None:
32
- return self._dist.set_seed(seed)
31
+ # def set_seed(self, seed: int) -> None:
32
+ # return self._dist.set_seed(seed)
33
33
 
34
- def get_table(self) -> List:
35
- return self._dist.get_table()
34
+ # def get_table(self) -> List:
35
+ # return self._dist.get_table()
36
36
 
37
37
  def _get_Sailfish_dist(self) -> _Sailfish.DiscreteDistribution:
38
38
  return self._dist
@@ -475,7 +475,8 @@ class Simulator:
475
475
  model_parameters: List = None,
476
476
  gamma_parameters_alpha : float = 1.0,
477
477
  gamma_parameters_categories: int = 1,
478
- invariant_sites_proportion: float = 0.0
478
+ invariant_sites_proportion: float = 0.0,
479
+ site_rate_correlation: float = 0.0,
479
480
  ) -> None:
480
481
  if not model:
481
482
  raise ValueError(f"please provide a substitution model from the the following list: {_Sailfish.modelCode}")
@@ -503,6 +504,8 @@ class Simulator:
503
504
 
504
505
  self._model_factory.set_gamma_parameters(gamma_parameters_alpha, gamma_parameters_categories)
505
506
  self._model_factory.set_invariant_sites_proportion(invariant_sites_proportion)
507
+ self._model_factory.set_site_rate_correlation(site_rate_correlation)
508
+
506
509
  self._simulator.init_substitution_sim(self._model_factory)
507
510
 
508
511
  self._is_sub_model_init = True
@@ -556,13 +559,11 @@ class Simulator:
556
559
  msa = Msa(blocktree._get_Sailfish_blocks(),
557
560
  self._simProtocol._get_root(),
558
561
  self.get_sequences_to_save())
562
+ self._simulator.set_aligned_sequence_map(msa._msa)
559
563
 
560
564
  # sim.init_substitution_sim(mFac)
561
565
  if self._simulation_type != SIMULATION_TYPE.NOSUBS:
562
- with tempfile.TemporaryDirectory() as tmpdirname:
563
- self._simulator.gen_substitutions_to_dir(msa.get_length(), tmpdirname)
564
- msa._msa.set_substitutions_folder(tmpdirname)
565
- msa._msa.write_msa_from_dir(str(output_file_path))
566
+ self._simulator.gen_substitutions_to_file(msa.get_length(), str(output_file_path))
566
567
 
567
568
 
568
569
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: msasim
3
- Version: 25.10.15
3
+ Version: 25.11.1
4
4
  Summary: A fast MSA simulator
5
5
  Home-page: https://github.com/elyawy/Sailfish-backend
6
6
  Author: Elya Wygoda
@@ -13,5 +13,4 @@ msasim.egg-info/not-zip-safe
13
13
  msasim.egg-info/requires.txt
14
14
  msasim.egg-info/top_level.txt
15
15
  src/main.cpp
16
- src/modelFactory.cpp
17
- src/rateMatrixSim.cpp
16
+ src/modelFactory.cpp
@@ -6,7 +6,7 @@ from setuptools import setup, find_packages
6
6
 
7
7
 
8
8
 
9
- __version__ = "25.10.15"
9
+ __version__ = "25.11.1"
10
10
 
11
11
  # The main interface is through Pybind11Extension.
12
12
  # * You can add cxx_std=11/14/17, and then build_ext can be removed.
@@ -4,6 +4,7 @@
4
4
  #include <pybind11/stl.h>
5
5
  #include <memory>
6
6
 
7
+ #include "../libs/pcg/pcg_random.hpp"
7
8
  #include "./Simulator.h"
8
9
 
9
10
  namespace py = pybind11;
@@ -29,6 +30,8 @@ PYBIND11_MODULE(_Sailfish, m) {
29
30
  Tree
30
31
  )pbdoc";
31
32
 
33
+ using SelectedRNG = pcg64;
34
+
32
35
  py::class_<Block>(m, "Block")
33
36
  .def(py::init<size_t, size_t>());
34
37
 
@@ -43,10 +46,7 @@ PYBIND11_MODULE(_Sailfish, m) {
43
46
  .export_values();
44
47
 
45
48
  py::class_<DiscreteDistribution>(m, "DiscreteDistribution")
46
- .def(py::init<std::vector<double>>())
47
- .def("draw_sample", &DiscreteDistribution::drawSample, "Draw a random sample according to the given distribution")
48
- .def_static("set_seed", &DiscreteDistribution::setSeed, "Set seed for the random number generator")
49
- .def("get_table", &DiscreteDistribution::getTable, "Get Vose's alias table (useful for debugging)");
49
+ .def(py::init<std::vector<double>>());
50
50
 
51
51
  py::class_<tree>(m, "Tree")
52
52
  .def(py::init<const std::string&, bool>(), "Create Phylogenetic tree object from newick formatted file")
@@ -128,32 +128,33 @@ PYBIND11_MODULE(_Sailfish, m) {
128
128
  .def("set_model_parameters" , &modelFactory::setModelParameters)
129
129
  .def("set_gamma_parameters" , &modelFactory::setGammaParameters)
130
130
  .def("set_invariant_sites_proportion", &modelFactory::setInvariantSitesProportion)
131
+ .def("set_site_rate_correlation", &modelFactory::setSiteRateCorrelation)
131
132
  .def("reset", &modelFactory::resetFactory);
132
133
 
133
134
 
134
- py::class_<Simulator>(m, "Simulator")
135
+ py::class_<Simulator<SelectedRNG>>(m, "Simulator")
135
136
  .def(py::init<SimulationProtocol*>())
136
- .def("reset_sim", &Simulator::resetSimulator)
137
- .def("gen_indels", &Simulator::generateSimulation)
138
- .def("run_sim", &Simulator::runSimulator)
139
- .def("init_substitution_sim", &Simulator::initSubstitionSim)
140
- .def("gen_substitutions", &Simulator::simulateSubstitutions)
141
- .def("gen_substitutions_to_dir", &Simulator::simulateAndWriteSubstitutions)
142
- .def("save_site_rates", &Simulator::setSaveRates)
143
- .def("get_site_rates", &Simulator::getSiteRates)
144
- .def("save_all_nodes_sequences", &Simulator::setSaveAllNodes)
145
- .def("save_root_sequence", &Simulator::setSaveRoot)
146
- .def("get_saved_nodes_mask", &Simulator::getNodesSaveList);
137
+ .def("reset_sim", &Simulator<SelectedRNG>::resetSimulator)
138
+ .def("gen_indels", &Simulator<SelectedRNG>::generateSimulation)
139
+ .def("run_sim", &Simulator<SelectedRNG>::runSimulator)
140
+ .def("init_substitution_sim", &Simulator<SelectedRNG>::initSubstitionSim)
141
+ .def("gen_substitutions", &Simulator<SelectedRNG>::simulateSubstitutions)
142
+ .def("gen_substitutions_to_file", &Simulator<SelectedRNG>::simulateAndWriteSubstitutions)
143
+ .def("set_aligned_sequence_map", &Simulator<SelectedRNG>::setAlignedSequenceMap)
144
+ .def("save_site_rates", &Simulator<SelectedRNG>::setSaveRates)
145
+ .def("get_site_rates", &Simulator<SelectedRNG>::getSiteRates)
146
+ .def("save_all_nodes_sequences", &Simulator<SelectedRNG>::setSaveAllNodes)
147
+ .def("save_root_sequence", &Simulator<SelectedRNG>::setSaveRoot)
148
+ .def("get_saved_nodes_mask", &Simulator<SelectedRNG>::getNodesSaveList);
147
149
 
148
150
 
149
151
  py::class_<MSA>(m, "Msa")
150
152
  .def(py::init<size_t, size_t, const std::vector<bool>& >())
151
- .def(py::init<BlockMap, tree::TreeNode*, const std::vector<bool>& >())
153
+ .def(py::init<BlockMap&, tree::TreeNode*, const std::vector<bool>& >())
152
154
  .def("generate_msas", &MSA::generateMSAs)
153
155
  .def("length", &MSA::getMSAlength)
154
156
  .def("num_sequences", &MSA::getNumberOfSequences)
155
157
  .def("fill_substitutions", &MSA::fillSubstitutions)
156
- .def("set_substitutions_folder", &MSA::setSubstitutionsFolder)
157
158
  .def("print_msa", &MSA::printFullMsa)
158
159
  .def("print_indels", &MSA::printIndels)
159
160
  .def("write_msa", &MSA::writeFullMsa)
@@ -82,9 +82,17 @@ void modelFactory::setGammaParameters(MDOUBLE alpha, size_t numCategories) {
82
82
  _state = factoryState::COMPLETE;
83
83
  }
84
84
 
85
+ void modelFactory::setSiteRateCorrelation(MDOUBLE correlation) {
86
+ if (correlation < -1.0 || correlation > 1.0) {
87
+ errorMsg::reportError("Rate correlation must be between -1 and 1");
88
+ }
89
+ _siteRateCorrelation = correlation;
90
+ }
91
+
85
92
  void modelFactory::resetFactory() {
86
93
  _state = factoryState::ALPHABET;
87
94
  _invariantProportion = 0.0;
95
+ _siteRateCorrelation = 0.0;
88
96
  }
89
97
 
90
98
 
@@ -1,278 +0,0 @@
1
- // $Id: simulateTree.cpp 8508 2010-08-12 15:21:04Z rubi $
2
- #include <stack>
3
- #include <unordered_map>
4
- #include <ostream>
5
- #include <sstream>
6
-
7
-
8
- #include "../libs/Phylolib/includes/definitions.h"
9
- #include "../libs/Phylolib/includes/treeUtil.h"
10
- #include "../libs/Phylolib/includes/talRandom.h"
11
- #include "../libs/Phylolib/includes/gammaDistribution.h"
12
- #include "../libs/Phylolib/includes/codon.h"
13
-
14
- #include "rateMatrixSim.h"
15
- // simulateTree::simulateTree(tree* _inEt,
16
- // const stochasticProcess* sp,
17
- // const alphabet* alph) :
18
- // _et(_inEt), _sp(sp),_alph(alph),_avgSubtitutionsPerSite(0.0) {
19
- // };
20
-
21
- rateMatrixSim::rateMatrixSim(modelFactory& mFac, std::shared_ptr<std::vector<bool>> nodesToSave) :
22
- _et(mFac.getTree()), _sp(mFac.getStochasticProcess()), _alph(mFac.getAlphabet()),
23
- _invariantSitesProportion(mFac.getInvariantSitesProportion()),
24
- _cpijGam(), _rootSequence(mFac.getAlphabet()), _subManager(mFac.getTree()->getNodesNum()),
25
- _nodesToSave(nodesToSave), _saveRates(false), _biased_coin(0,1) {
26
- // _et = mFac.getTree();
27
- // _sp = mFac.getStochasticProcess();
28
- // _alph = mFac.getAlphabet();
29
-
30
- size_t alphaSize = _sp->alphabetSize();
31
-
32
- _cpijGam.fillPij(*_et, *_sp);
33
- initGillespieSampler();
34
-
35
-
36
- std::vector<MDOUBLE> rateProbs;
37
- for (int j = 0 ; j < _sp->categories(); ++j) {
38
- MDOUBLE currentRateProb = _sp->ratesProb(j);
39
- currentRateProb = currentRateProb * (1.0 - _invariantSitesProportion);
40
- rateProbs.push_back(currentRateProb);
41
- }
42
- if (_invariantSitesProportion > 0.0) rateProbs.push_back(_invariantSitesProportion);
43
-
44
- _rateSampler = std::make_unique<DiscreteDistribution>(rateProbs);
45
-
46
- std::vector<MDOUBLE> frequencies;
47
- for (int j = 0; j < alphaSize; ++j) {
48
- frequencies.push_back(_sp->freq(j));
49
- }
50
- _frequencySampler = std::make_unique<DiscreteDistribution>(frequencies);
51
-
52
- _simulatedSequences = std::make_unique<sequenceContainer>();
53
-
54
- };
55
-
56
- void rateMatrixSim::setSaveRates(bool saveRates) {
57
- _saveRates = saveRates;
58
- }
59
-
60
- void rateMatrixSim::initGillespieSampler() {
61
- _gillespieSampler.resize(_alph->size());
62
- for (size_t i = 0; i < _alph->size(); ++i) {
63
- std::vector<double> qRates(_alph->size(), 0.0);
64
- double sum = -_sp->Qij(i,i);
65
- double normalizer = 1.0 / sum;
66
- for (size_t j = 0; j < _alph->size(); ++j) {
67
- if (i==j) continue;
68
- qRates[j] = _sp->Qij(i,j) * normalizer;
69
- // std::cout << i << j << "->" << qRates[j] << ",";
70
- }
71
- // std::cout << "\n" << i << " " << sum << "\n";
72
- _gillespieSampler[i] = std::make_unique<DiscreteDistribution>(qRates);
73
- }
74
- }
75
-
76
- // simulateTree::simulateTree(const tree& _inEt,
77
- // const stochasticProcess& sp,
78
- // const alphabet* alph) : _sp(sp) {
79
- // _et = _inEt;
80
- // // _sp = sp;
81
- // _alph = alph;
82
- // _avgSubtitutionsPerSite = 0.0;
83
- // };
84
-
85
- rateMatrixSim::~rateMatrixSim() {
86
- }
87
-
88
- // void rateMatrixSim::setSeed(size_t seed) {
89
- // _seed = seed;
90
- // _mt_rand->seed(seed);
91
- // }
92
-
93
- void rateMatrixSim::setRng(mt19937_64 *rng) {
94
- _mt_rand = rng;
95
- }
96
-
97
-
98
- // const mt19937_64& rateMatrixSim::getRng(){
99
- // return *_mt_rand;
100
- // }
101
-
102
-
103
- void rateMatrixSim::generate_substitution_log(int seqLength) {
104
- std::vector<MDOUBLE> ratesVec(seqLength);
105
-
106
- MDOUBLE sumOfRatesAcrossSites = 0.0;
107
- _rateCategories.resize(seqLength);
108
- for (int h = 0; h < seqLength; h++) {
109
- int selectedRandomCategory = _rateSampler->drawSample() - 1;
110
- _rateCategories[h] = selectedRandomCategory;
111
- if (selectedRandomCategory >= _sp->categories()) {
112
- ratesVec[h] = 0.0;
113
- continue;
114
- }
115
- ratesVec[h] = _sp->rates(selectedRandomCategory);
116
- sumOfRatesAcrossSites += ratesVec[h];
117
- }
118
- if (_saveRates) _siteRates.insert(_siteRates.end(), ratesVec.begin(), ratesVec.end());
119
- // MDOUBLE sumOfRatesNoramlizingFactor = 1.0 / sumOfRatesAcrossSites;
120
-
121
- // _siteSampler = std::make_unique<DiscreteDistribution>(ratesVec, sumOfRatesNoramlizingFactor);
122
- _rootSequence.resize(seqLength);
123
- generateRootSeq(seqLength, ratesVec);
124
- if ((*_nodesToSave)[_et->getRoot()->id()]) saveSequence(_et->getRoot()->id(), _et->getRoot()->name());
125
-
126
- mutateSeqRecuresively(_et->getRoot(), seqLength);
127
- _subManager.clear();
128
- }
129
-
130
- void rateMatrixSim::mutateSeqRecuresively(tree::nodeP currentNode, int seqLength) {
131
- if (currentNode->isLeaf()) return;
132
-
133
- for (auto &node: currentNode->getSons()) {
134
- mutateSeqAlongBranch(node, seqLength);
135
- if ((*_nodesToSave)[node->id()]) saveSequence(node->id(), node->name());
136
- mutateSeqRecuresively(node, seqLength);
137
- if (!_subManager.isEmpty(node->id())) {
138
- _subManager.undoSubs(node->id(), _rootSequence, _rateCategories, _sp.get());
139
- }
140
- }
141
- }
142
-
143
- void rateMatrixSim::mutateSeqAlongBranch(tree::nodeP currentNode, int seqLength) {
144
- const MDOUBLE distToFather = currentNode->dis2father();
145
- mutateEntireSeq(currentNode, seqLength);
146
-
147
- // if (distToFather > 0.5) {
148
- // mutateEntireSeq(currentNode, seqLength);
149
- // } else {
150
- // mutateSeqGillespie(currentNode, seqLength, distToFather);
151
- // }
152
- // testSumOfRates();
153
- }
154
-
155
-
156
- void rateMatrixSim::mutateEntireSeq(tree::nodeP currentNode, int seqLength) {
157
- const int nodeId = currentNode->id();
158
- const int parentId = currentNode->father()->id();
159
-
160
- for (size_t site = 0; site < seqLength; ++site) {
161
- ALPHACHAR parentChar = _rootSequence[site];//_subManager.getCharacter(parentId, site, _rootSequence);
162
- if (_rateCategories[site] == _sp->categories()) continue;
163
- ALPHACHAR nextChar = _cpijGam.getRandomChar(_rateCategories[site], nodeId, parentChar);
164
- if (nextChar != parentChar){
165
- _subManager.handleEvent(nodeId, site, nextChar, _rateCategories, _sp.get(), _rootSequence);
166
- }
167
- }
168
- }
169
-
170
-
171
- void rateMatrixSim::mutateSeqGillespie(tree::nodeP currentNode, int seqLength, MDOUBLE distToParent) {
172
- // std::cout << "mutating sequence using Gillespie!\n";
173
-
174
- const int nodeId = currentNode->id();
175
- const int parentId = currentNode->father()->id();
176
- MDOUBLE branchLength = distToParent;
177
-
178
- double lambdaParam = _subManager.getReactantsSum();
179
- std::exponential_distribution<double> distribution(-lambdaParam);
180
- double waitingTime = distribution(*_mt_rand);
181
- if (waitingTime < 0) {
182
- std::cout << branchLength << " " << lambdaParam << " " << waitingTime << "\n";
183
- errorMsg::reportError("waiting time is negative :(");
184
- }
185
- while (waitingTime < branchLength) {
186
- if (waitingTime < 0) {
187
- std::cout << branchLength << " " << lambdaParam << " " << waitingTime << "\n";
188
- errorMsg::reportError("waiting time is negative :(");
189
- }
190
-
191
- int mutatedSite = _subManager.sampleSite(*_mt_rand);
192
- ALPHACHAR parentChar = _rootSequence[mutatedSite];
193
- ALPHACHAR nextChar = _gillespieSampler[parentChar]->drawSample() - 1;
194
- // std::cout << (int)parentChar << "->" << (int)nextChar << "\n";
195
- _subManager.handleEvent(nodeId, mutatedSite, nextChar, _rateCategories, _sp.get(), _rootSequence);
196
-
197
- lambdaParam = _subManager.getReactantsSum();
198
- branchLength = branchLength - waitingTime;
199
- std::exponential_distribution<double> distribution(-lambdaParam);
200
- waitingTime = distribution(*_mt_rand);
201
-
202
- }
203
- }
204
-
205
-
206
-
207
-
208
- void rateMatrixSim::generateRootSeq(int seqLength, std::vector<MDOUBLE>& ratesVec) {
209
- size_t rootID = _et->getRoot()->id();
210
- for (int i = 0; i < seqLength; i++) {
211
- ALPHACHAR newChar = _frequencySampler->drawSample() - 1;
212
- // ratesVec[i] = ratesVec[i]*(-_sp->Qij(newChar, newChar));
213
- _rootSequence[i] = newChar;
214
- }
215
- // std::cout << ">Root-sequence\n" << _rootSequence << "\n";
216
- // std::cout << ">Rates\n" << ratesVec;
217
- _subManager.handleRootSequence(seqLength, ratesVec, _sp.get(), _rootSequence);
218
-
219
- _rootSequence.setAlphabet(_alph);
220
- _rootSequence.setName(_et->getRoot()->name());
221
- _rootSequence.setID(_et->getRoot()->id());
222
- }
223
-
224
-
225
- void rateMatrixSim::saveSequence(const int &nodeId,const std::string &name) {
226
- sequence temp(_rootSequence);
227
- temp.setName(name);
228
- temp.setID(nodeId);
229
- // std::cout << temp << "\n";
230
- _simulatedSequences->add(temp);
231
- }
232
-
233
- // sequenceContainer rateMatrixSim::toSeqData() {
234
- // sequenceContainer myseqData;
235
- // for (int i=0; i < _simulatedSequences.size(); ++i) {
236
- // myseqData.add(*_simulatedSequences[i]);
237
- // }
238
- // return myseqData;
239
- // }
240
-
241
-
242
-
243
- std::unique_ptr<sequenceContainer> rateMatrixSim::getSequenceContainer() {
244
- // std::unique_ptr<sequenceContainer> myseqData = std::make_unique<sequenceContainer>();
245
- // // sequenceContainer myseqData;
246
- // for (int i=0; i < _simulatedSequences.size(); ++i) {
247
- // tree::nodeP theCurNode = _et->findNodeById(_simulatedSequences[i]->id());
248
- // if (theCurNode == NULL)
249
- // errorMsg::reportError("could not find the specified name: " + _simulatedSequences[i]->name());
250
- // if (theCurNode->isInternal()) continue;
251
- auto outputSequences = std::move(_simulatedSequences);
252
- _simulatedSequences = std::make_unique<sequenceContainer>();
253
- // myseqData->add(*std::move(_simulatedSequences[i]));
254
- // }
255
-
256
- return outputSequences;
257
- }
258
-
259
-
260
- bool rateMatrixSim::testSumOfRates() {
261
- MDOUBLE sumOfRates = 0.0;
262
- for (size_t i = 0; i < _rootSequence.seqLen(); i++) {
263
- ALPHACHAR currentChar = _rootSequence[i];
264
- MDOUBLE currentQii = _sp->Qij(currentChar, currentChar);
265
- MDOUBLE currentRate = _sp->rates(_rateCategories[i]);
266
- sumOfRates += (currentQii*currentRate);
267
- }
268
- MDOUBLE preCalculatedSum = _subManager.getReactantsSum();
269
- if (abs(preCalculatedSum - sumOfRates) > 1e-6) {
270
- std::cout << "preCalculatedSum=" << preCalculatedSum << " "
271
- << "sumOfRates=" << sumOfRates;
272
- errorMsg::reportError("Error in sum of rates calculation!");
273
- }
274
- std::cout << "preCalculatedSum is correct\n" << "preCalculatedSum=" << preCalculatedSum << " "
275
- << "sumOfRates=" << sumOfRates << "\n";
276
-
277
- return true;
278
- }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes