phunc 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phunc-1.1.0/LICENSE +21 -0
- phunc-1.1.0/PKG-INFO +97 -0
- phunc-1.1.0/README.md +70 -0
- phunc-1.1.0/phunc.egg-info/PKG-INFO +97 -0
- phunc-1.1.0/phunc.egg-info/SOURCES.txt +11 -0
- phunc-1.1.0/phunc.egg-info/dependency_links.txt +1 -0
- phunc-1.1.0/phunc.egg-info/entry_points.txt +2 -0
- phunc-1.1.0/phunc.egg-info/requires.txt +5 -0
- phunc-1.1.0/phunc.egg-info/top_level.txt +1 -0
- phunc-1.1.0/setup.cfg +4 -0
- phunc-1.1.0/setup.py +33 -0
- phunc-1.1.0/src/__init__.py +1 -0
- phunc-1.1.0/src/phunc.py +289 -0
phunc-1.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Guilherme Azevedo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
phunc-1.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: phunc
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: A CLI tool to calculate the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.
|
|
5
|
+
Home-page: https://github.com/ghfazevedo/phunc
|
|
6
|
+
Author: Guilherme Azevedo
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: Unix
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: dendropy
|
|
14
|
+
Requires-Dist: matplotlib
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: scipy
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: home-page
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
# Phenotype evolution probability Under Neutral Coalescent (PhUNC)
|
|
29
|
+
|
|
30
|
+
This is a Python tool to simulate a hypothetical nuclear locus that controls a phenotypic character under the coalescent process and to calculate the probability of this character being fixed in alternate states in different populations. The method was first proposed by [Masta & Maddison (2002)](https://doi.org/10.1073/pnas.072493099) and this program was used in [Azevedo et al. (2026)](https://doi.org/10.1093/evolut/qpag049). The model assumes that the phenotypic states of the character are controlled by a single mutation in one locus and that the mutation rate is the slowest possible (parsimony).
|
|
31
|
+
The program can be used to assess the probability of drift leading to fixation, using number of traits = number of species in tree, or to evaluate the probability of different hemiplasy scenarios using binary state with more than two species.
|
|
32
|
+
|
|
33
|
+
If you use this program, please cite [Azevedo et al. (2026)](https://doi.org/10.1093/evolut/qpag049) and refer to [this GitHub page](https://github.com/ghfazevedo/phunc).
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
This program uses [DendroPy](https://jeetsukumaran.github.io/DendroPy) library that is installed automatically as a dependency.
|
|
38
|
+
Please cite [DendroPy](https://jeetsukumaran.github.io/DendroPy).
|
|
39
|
+
|
|
40
|
+
To install PhUnC clone this github page and use pip.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
git clone https://github.com/ghfazevedo/phunc
|
|
44
|
+
cd phunc
|
|
45
|
+
pip install .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Example usage
|
|
49
|
+
```bash
|
|
50
|
+
phunc -t data/tree.nwck -n 1000 -s 10,10,10,10 -ts 3
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
You can also provide a trait matrix (tab delimited) if you want to use a number of states that is different from the total number of species in the species tree. This can be useful to relax the assumption that all states must be fixed, to use polymorphisms or uncertain coding ([0,1] meaning 0 or 1), or to explore probabilities of different hemiplasy scenarios.
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
phunc -t data/tree.nwck -n 1000 -s 10,10,10,10 -ts 2 -p data/phenotype_map.txt -o phenofun_out_wMatrix
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Outputs
|
|
60
|
+
The program prints the probability of the target *s* to the terminal and creates the files:
|
|
61
|
+
1. [S_statsProbs](phenofun_out/S_statsProbs.txt) with the probability of the observed s statistics provided.
|
|
62
|
+
2. [simulated_gene_trees.nwck](phenofun_out/simulated_gene_trees.nwck) with all simulated trees.
|
|
63
|
+
3. [target_gene_trees.nwck](phenofun_out/target_gene_trees.nwck) with only gene trees that show the target *s*.
|
|
64
|
+
4. [simulated_s.csv](phenofun_out/simulated_s.csv) with all values of *s* for all simulations.
|
|
65
|
+
5. [histogram.pdf](phenofun_out/histogram.pdf) and [histogram.png](phenofun_out/histogram.png), which are the histogram plots with the estimated PDF curve, with the lower 5% percentile marked in red, and with a vertical red line showing the target *s* value.
|
|
66
|
+

|
|
67
|
+
6. [barplot.pdf](phenofun_out/barplot.pdf) and [barplot.png](phenofun_out/barplot.png), which are the plots with the probability of the target *s* given drift and the probability of *s* being different from the target.
|
|
68
|
+

|
|
69
|
+
|
|
70
|
+
## Command Options
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
usage: phunc [-h] -t TREE [-o OUT_DIR] [-n N_SIMULATIONS] -s N_SAMPLED_INDIVIDUALS -ts TARGET_S_STATISTICS [-p PHENOTYPE_MAP] [-v]
|
|
74
|
+
|
|
75
|
+
Calculates the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.
|
|
76
|
+
|
|
77
|
+
options:
|
|
78
|
+
-h, --help show this help message and exit
|
|
79
|
+
-t, --tree TREE Path to the population/species tree with population size as branch annotations (Nexus format)
|
|
80
|
+
-o, --out_dir OUT_DIR
|
|
81
|
+
Output directory
|
|
82
|
+
-n, --n_simulations N_SIMULATIONS
|
|
83
|
+
Number of gene trees to simulate.
|
|
84
|
+
-s, --n_sampled_individuals N_SAMPLED_INDIVIDUALS
|
|
85
|
+
The number of individuals per population/species separated by comma. It should be in the same order as the populations
|
|
86
|
+
appear in the species tree file.
|
|
87
|
+
-ts, --target_s_statistics TARGET_S_STATISTICS
|
|
88
|
+
The target s statistics as observed in the real world data to calculate the probability of generating it through the
|
|
89
|
+
simulations.
|
|
90
|
+
-p, --phenotype_map PHENOTYPE_MAP
|
|
91
|
+
Optional: Path to a tab-separated file associating species in the tree with phenotype codes. Format: one line per
|
|
92
|
+
species, e.g. 'species1 0', 'species2 0', 'species3 1', 'species4 [0,1]'. Use [0,1] for uncertain or polymorphic states.
|
|
93
|
+
If provided, this file will be used to assign phenotype states to taxa instead of automatically set one different state
|
|
94
|
+
per species on tree.
|
|
95
|
+
-v, --version show program's version number and exit
|
|
96
|
+
```
|
|
97
|
+
|
phunc-1.1.0/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Phenotype evolution probability Under Neutral Coalescent (PhUNC)
|
|
2
|
+
|
|
3
|
+
This is a Python tool to simulate a hypothetical nuclear locus that controls a phenotypic character under the coalescent process and to calculate the probability of this character being fixed in alternate states in different populations. The method was first proposed by [Masta & Maddison (2002)](https://doi.org/10.1073/pnas.072493099) and this program was used in [Azevedo et al. (2026)](https://doi.org/10.1093/evolut/qpag049). The model assumes that the phenotypic states of the character are controlled by a single mutation in one locus and that the mutation rate is the slowest possible (parsimony).
|
|
4
|
+
The program can be used to assess the probability of drift leading to fixation, using number of traits = number of species in tree, or to evaluate the probability of different hemiplasy scenarios using binary state with more than two species.
|
|
5
|
+
|
|
6
|
+
If you use this program, please cite [Azevedo et al. (2026)](https://doi.org/10.1093/evolut/qpag049) and refer to [this GitHub page](https://github.com/ghfazevedo/phunc).
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
This program uses [DendroPy](https://jeetsukumaran.github.io/DendroPy) library that is installed automatically as a dependency.
|
|
11
|
+
Please cite [DendroPy](https://jeetsukumaran.github.io/DendroPy).
|
|
12
|
+
|
|
13
|
+
To install PhUnC clone this github page and use pip.
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
git clone https://github.com/ghfazevedo/phunc
|
|
17
|
+
cd phunc
|
|
18
|
+
pip install .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Example usage
|
|
22
|
+
```bash
|
|
23
|
+
phunc -t data/tree.nwck -n 1000 -s 10,10,10,10 -ts 3
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
You can also provide a trait matrix (tab delimited) if you want to use a number of states that is different from the total number of species in the species tree. This can be useful to relax the assumption that all states must be fixed, to use polymorphisms or uncertain coding ([0,1] meaning 0 or 1), or to explore probabilities of different hemiplasy scenarios.
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
phunc -t data/tree.nwck -n 1000 -s 10,10,10,10 -ts 2 -p data/phenotype_map.txt -o phenofun_out_wMatrix
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Outputs
|
|
33
|
+
The program prints the probability of the target *s* to the terminal and creates the files:
|
|
34
|
+
1. [S_statsProbs](phenofun_out/S_statsProbs.txt) with the probability of the observed s statistics provided.
|
|
35
|
+
2. [simulated_gene_trees.nwck](phenofun_out/simulated_gene_trees.nwck) with all simulated trees.
|
|
36
|
+
3. [target_gene_trees.nwck](phenofun_out/target_gene_trees.nwck) with only gene trees that show the target *s*.
|
|
37
|
+
4. [simulated_s.csv](phenofun_out/simulated_s.csv) with all values of *s* for all simulations.
|
|
38
|
+
5. [histogram.pdf](phenofun_out/histogram.pdf) and [histogram.png](phenofun_out/histogram.png), which are the histogram plots with the estimated PDF curve, with the lower 5% percentile marked in red, and with a vertical red line showing the target *s* value.
|
|
39
|
+

|
|
40
|
+
6. [barplot.pdf](phenofun_out/barplot.pdf) and [barplot.png](phenofun_out/barplot.png), which are the plots with the probability of the target *s* given drift and the probability of *s* being different from the target.
|
|
41
|
+

|
|
42
|
+
|
|
43
|
+
## Command Options
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
usage: phunc [-h] -t TREE [-o OUT_DIR] [-n N_SIMULATIONS] -s N_SAMPLED_INDIVIDUALS -ts TARGET_S_STATISTICS [-p PHENOTYPE_MAP] [-v]
|
|
47
|
+
|
|
48
|
+
Calculates the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.
|
|
49
|
+
|
|
50
|
+
options:
|
|
51
|
+
-h, --help show this help message and exit
|
|
52
|
+
-t, --tree TREE Path to the population/species tree with population size as branch annotations (Nexus format)
|
|
53
|
+
-o, --out_dir OUT_DIR
|
|
54
|
+
Output directory
|
|
55
|
+
-n, --n_simulations N_SIMULATIONS
|
|
56
|
+
Number of gene trees to simulate.
|
|
57
|
+
-s, --n_sampled_individuals N_SAMPLED_INDIVIDUALS
|
|
58
|
+
The number of individuals per population/species separated by comma. It should be in the same order as the populations
|
|
59
|
+
appear in the species tree file.
|
|
60
|
+
-ts, --target_s_statistics TARGET_S_STATISTICS
|
|
61
|
+
The target s statistics as observed in the real world data to calculate the probability of generating it through the
|
|
62
|
+
simulations.
|
|
63
|
+
-p, --phenotype_map PHENOTYPE_MAP
|
|
64
|
+
Optional: Path to a tab-separated file associating species in the tree with phenotype codes. Format: one line per
|
|
65
|
+
species, e.g. 'species1 0', 'species2 0', 'species3 1', 'species4 [0,1]'. Use [0,1] for uncertain or polymorphic states.
|
|
66
|
+
If provided, this file will be used to assign phenotype states to taxa instead of automatically set one different state
|
|
67
|
+
per species on tree.
|
|
68
|
+
-v, --version show program's version number and exit
|
|
69
|
+
```
|
|
70
|
+
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: phunc
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: A CLI tool to calculate the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.
|
|
5
|
+
Home-page: https://github.com/ghfazevedo/phunc
|
|
6
|
+
Author: Guilherme Azevedo
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: Unix
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: dendropy
|
|
14
|
+
Requires-Dist: matplotlib
|
|
15
|
+
Requires-Dist: pandas
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
Requires-Dist: scipy
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: classifier
|
|
20
|
+
Dynamic: description
|
|
21
|
+
Dynamic: description-content-type
|
|
22
|
+
Dynamic: home-page
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
# Phenotype evolution probability Under Neutral Coalescent (PhUNC)
|
|
29
|
+
|
|
30
|
+
This is a Python tool to simulate a hypothetical nuclear locus that controls a phenotypic character under the coalescent process and to calculate the probability of this character being fixed in alternate states in different populations. The method was first proposed by [Masta & Maddison (2002)](https://doi.org/10.1073/pnas.072493099) and this program was used in [Azevedo et al. (2026)](https://doi.org/10.1093/evolut/qpag049). The model assumes that the phenotypic states of the character are controlled by a single mutation in one locus and that the mutation rate is the slowest possible (parsimony).
|
|
31
|
+
The program can be used to assess the probability of drift leading to fixation, using number of traits = number of species in tree, or to evaluate the probability of different hemiplasy scenarios using binary state with more than two species.
|
|
32
|
+
|
|
33
|
+
If you use this program, please cite [Azevedo et al. (2026)](https://doi.org/10.1093/evolut/qpag049) and refer to [this GitHub page](https://github.com/ghfazevedo/phunc).
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
This program uses [DendroPy](https://jeetsukumaran.github.io/DendroPy) library that is installed automatically as a dependency.
|
|
38
|
+
Please cite [DendroPy](https://jeetsukumaran.github.io/DendroPy).
|
|
39
|
+
|
|
40
|
+
To install PhUnC clone this github page and use pip.
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
git clone https://github.com/ghfazevedo/phunc
|
|
44
|
+
cd phunc
|
|
45
|
+
pip install .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Example usage
|
|
49
|
+
```bash
|
|
50
|
+
phunc -t data/tree.nwck -n 1000 -s 10,10,10,10 -ts 3
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
You can also provide a trait matrix (tab delimited) if you want to use a number of states that is different from the total number of species in the species tree. This can be useful to relax the assumption that all states must be fixed, to use polymorphisms or uncertain coding ([0,1] meaning 0 or 1), or to explore probabilities of different hemiplasy scenarios.
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
phunc -t data/tree.nwck -n 1000 -s 10,10,10,10 -ts 2 -p data/phenotype_map.txt -o phenofun_out_wMatrix
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Outputs
|
|
60
|
+
The program prints the probability of the target *s* to the terminal and creates the files:
|
|
61
|
+
1. [S_statsProbs](phenofun_out/S_statsProbs.txt) with the probability of the observed s statistics provided.
|
|
62
|
+
2. [simulated_gene_trees.nwck](phenofun_out/simulated_gene_trees.nwck) with all simulated trees.
|
|
63
|
+
3. [target_gene_trees.nwck](phenofun_out/target_gene_trees.nwck) with only gene trees that show the target *s*.
|
|
64
|
+
4. [simulated_s.csv](phenofun_out/simulated_s.csv) with all values of *s* for all simulations.
|
|
65
|
+
5. [histogram.pdf](phenofun_out/histogram.pdf) and [histogram.png](phenofun_out/histogram.png), which are the histogram plots with the estimated PDF curve, with the lower 5% percentile marked in red, and with a vertical red line showing the target *s* value.
|
|
66
|
+

|
|
67
|
+
6. [barplot.pdf](phenofun_out/barplot.pdf) and [barplot.png](phenofun_out/barplot.png), which are the plots with the probability of the target *s* given drift and the probability of *s* being different from the target.
|
|
68
|
+

|
|
69
|
+
|
|
70
|
+
## Command Options
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
usage: phunc [-h] -t TREE [-o OUT_DIR] [-n N_SIMULATIONS] -s N_SAMPLED_INDIVIDUALS -ts TARGET_S_STATISTICS [-p PHENOTYPE_MAP] [-v]
|
|
74
|
+
|
|
75
|
+
Calculates the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.
|
|
76
|
+
|
|
77
|
+
options:
|
|
78
|
+
-h, --help show this help message and exit
|
|
79
|
+
-t, --tree TREE Path to the population/species tree with population size as branch annotations (Nexus format)
|
|
80
|
+
-o, --out_dir OUT_DIR
|
|
81
|
+
Output directory
|
|
82
|
+
-n, --n_simulations N_SIMULATIONS
|
|
83
|
+
Number of gene trees to simulate.
|
|
84
|
+
-s, --n_sampled_individuals N_SAMPLED_INDIVIDUALS
|
|
85
|
+
The number of individuals per population/species separated by comma. It should be in the same order as the populations
|
|
86
|
+
appear in the species tree file.
|
|
87
|
+
-ts, --target_s_statistics TARGET_S_STATISTICS
|
|
88
|
+
The target s statistics as observed in the real world data to calculate the probability of generating it through the
|
|
89
|
+
simulations.
|
|
90
|
+
-p, --phenotype_map PHENOTYPE_MAP
|
|
91
|
+
Optional: Path to a tab-separated file associating species in the tree with phenotype codes. Format: one line per
|
|
92
|
+
species, e.g. 'species1 0', 'species2 0', 'species3 1', 'species4 [0,1]'. Use [0,1] for uncertain or polymorphic states.
|
|
93
|
+
If provided, this file will be used to assign phenotype states to taxa instead of automatically set one different state
|
|
94
|
+
per species on tree.
|
|
95
|
+
-v, --version show program's version number and exit
|
|
96
|
+
```
|
|
97
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
phunc
|
phunc-1.1.0/setup.cfg
ADDED
phunc-1.1.0/setup.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
|
|
3
|
+
# Read the long description inside a context manager so the file handle is
# closed deterministically, and with an explicit encoding so the build does
# not depend on the locale of the machine running it.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="phunc",
    version="1.1.0",
    author="Guilherme Azevedo",
    description="A CLI tool to calculate the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/ghfazevedo/phunc",
    # The single package "phunc" is served from the src/ directory.
    packages=["phunc"],
    package_dir={"phunc": "src/"},
    include_package_data=True,
    install_requires=[
        'dendropy',
        'matplotlib',
        'pandas',
        'numpy',
        'scipy'
    ],
    entry_points={
        'console_scripts': [
            # Installs the `phunc` command, which calls main() in phunc/phunc.py.
            'phunc = phunc.phunc:main',
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: Unix",
    ],
    python_requires='>=3.6',
)
|
|
33
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.0"
|
phunc-1.1.0/src/phunc.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import dendropy
|
|
6
|
+
from dendropy.simulate import treesim
|
|
7
|
+
from dendropy.model import reconcile
|
|
8
|
+
from dendropy.model import coalescent
|
|
9
|
+
#from dendropy.model.reconcile import monophyletic_partition_discordance
|
|
10
|
+
from dendropy.model.parsimony import fitch_down_pass
|
|
11
|
+
import matplotlib.pyplot as plt
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import numpy as np
|
|
14
|
+
from scipy.stats import gaussian_kde
|
|
15
|
+
from phunc import __version__
|
|
16
|
+
#from scipy.stats import lognorm
|
|
17
|
+
|
|
18
|
+
# Function to parse arguments
|
|
19
|
+
def parse_arguments(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which is
            the behavior callers that pass no argument already rely on.

    Returns:
        argparse.Namespace with the attributes ``tree``, ``out_dir``,
        ``n_simulations`` (int), ``n_sampled_individuals`` (comma-separated
        string), ``target_s_statistics`` (int) and ``phenotype_map``
        (``None`` when the option is not given).
    """
    parser = argparse.ArgumentParser(description="Calculates the probability of fixation of differences in a hypothetical nuclear locus that controls phenotype under neutral divergence.")
    parser.add_argument("-t", "--tree", required=True, help="Path to the population/species tree with population size as branch annotations (Nexus format)")
    parser.add_argument("-o", "--out_dir", default="./phenofun_out", help="Output directory")
    # type=int lets argparse reject non-numeric values with a usage error
    # instead of a traceback later on; int() applied to the result is a no-op.
    parser.add_argument("-n", "--n_simulations", type=int, default=100, help="Number of gene trees to simulate.")
    parser.add_argument("-s", "--n_sampled_individuals", type=str, required=True, help="The number of individuals per population/species separated by comma. It should be in the same order as the populations appear in the species tree file.")
    parser.add_argument("-ts", "--target_s_statistics", type=int, required=True, help="The target s statistics as observed in the real world data to calculate the probability of generating it through the simulations.")
    parser.add_argument("-p", "--phenotype_map", type=str, required=False,
                        help="Optional: Path to a tab-separated file associating species in the tree with phenotype codes. Format: one line per species, e.g. 'species1\t0', 'species2\t0', 'species3\t1', 'species4\t[0,1]'. Use [0,1] for uncertain or polymorphic states. If provided, this file will be used to assign phenotype states to taxa instead of automatically set one different state per species on tree.")
    parser.add_argument('-v', '--version', action='version', version=f'%(prog)s {__version__}')

    return parser.parse_args(argv)
|
|
31
|
+
|
|
32
|
+
# Function to confirm with the user if directory exists
|
|
33
|
+
def confirm_proceed(message="Directory already exists. Do you want to proceed? (y/n): "):
    """Ask a yes/no question on standard input until a valid answer is read.

    The answer is stripped and lower-cased before comparison, so ``" Y "``
    counts as yes.

    Args:
        message: Prompt shown to the user on every attempt.

    Returns:
        True if the user answers ``y``; False if the user answers ``n`` or if
        standard input ends before a valid answer is read.
    """
    while True:
        try:
            response = input(message).strip().lower()
        except EOFError:
            # stdin is closed or exhausted (e.g. a non-interactive run), so no
            # answer can ever arrive: decline instead of crashing.
            print("\nNo input available. Exiting program.")
            return False
        if response == 'y':
            return True
        if response == 'n':
            print("Exiting program.")
            return False
        print("Please enter 'y' or 'n'.")
|
|
43
|
+
|
|
44
|
+
def _parse_phenotype_code(code_str):
    """Convert one phenotype code field into a set of states.

    ``"1"`` becomes ``{1}``; a bracketed list such as ``"[0,1]"`` becomes
    ``{0, 1}``.

    Raises:
        ValueError: if the field is neither an integer nor a literal list.
    """
    import ast  # local import: only this helper needs it

    if code_str.startswith('[') and code_str.endswith(']'):
        try:
            # literal_eval only accepts Python literals, so text from the
            # phenotype file can never execute code (unlike eval).
            return set(ast.literal_eval(code_str))
        except (ValueError, SyntaxError, TypeError) as err:
            raise ValueError(f"Invalid phenotype code: {code_str}") from err
    return {int(code_str)}


def _read_phenotype_map(path):
    """Read a tab-separated ``species<TAB>code`` file into a dict of state sets.

    Blank lines and lines starting with ``#`` are skipped.

    Raises:
        ValueError: if a line does not have exactly two tab-separated fields
            or its code cannot be parsed.
    """
    phenotype_map = {}
    with open(path, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('\t')
            if len(parts) != 2:
                raise ValueError(f"Invalid phenotype_map line: {line}")
            species, code_str = (part.strip() for part in parts)
            phenotype_map[species] = _parse_phenotype_code(code_str)
    return phenotype_map


def _build_taxon_state_sets(taxon_namespace, phenotype_map=None):
    """Map every gene-tree taxon to a one-character list of state sets.

    With a ``phenotype_map`` the states come from the map; without one each
    distinct species receives its own integer code, in order of appearance.

    Raises:
        ValueError: if a species is missing from ``phenotype_map``.
    """
    taxon_state_sets_map = {}
    species_to_code = {}
    for taxon in taxon_namespace:
        # The species name is taken as the first whitespace-separated token of
        # the gene taxon label (labels look like "species1 0").
        # NOTE(review): this presumably breaks for species names that contain
        # whitespace - confirm against the tree files in use.
        species = taxon.label.split()[0]
        if phenotype_map is not None:
            if species not in phenotype_map:
                raise ValueError(f"Species '{species}' not found in phenotype_map file.")
            states = phenotype_map[species]
        else:
            states = {species_to_code.setdefault(species, len(species_to_code))}
        taxon_state_sets_map[taxon] = [states]
    return taxon_state_sets_map


def _plot_histogram(s_distribution, target_s, probability, out_dir):
    """Save histogram.png / histogram.pdf of the simulated s values."""
    # Bars whose left edge lies below the 5th percentile are drawn in red.
    lower_bound = np.percentile(s_distribution, 5.0)

    # One integer-wide bin per s value; +2 so the maximum gets its own bin.
    bins = np.arange(int(min(s_distribution)), int(max(s_distribution)) + 2)

    fig, ax = plt.subplots()
    _, bin_edges, patches = ax.hist(
        s_distribution,
        bins=bins,
        density=True,
        edgecolor='black',
        alpha=0.7,
        histtype='bar',
        rwidth=1.0,  # bars touch each other
    )
    for patch, left_edge in zip(patches, bin_edges[:-1]):
        patch.set_facecolor('red' if left_edge < lower_bound else 'blue')

    # Vertical marker at the observed (target) s value.
    ax.axvline(x=target_s, color='r',
               linestyle='dashed', linewidth=2,
               label=f'Observed s = {target_s}')

    # A smoothed PDF overlay (gaussian KDE or lognormal fit) is currently
    # disabled; only the histogram bars are drawn.

    # Label just above the top of the axes, centred on the marker.
    y_top = ax.get_ylim()[1] * 1.01
    ax.text(target_s, y_top, f"p(s={target_s})={probability}", color='black', ha='center', va='bottom', fontsize=12, fontweight='bold')

    ax.set_xlabel('Simulated s statistics')
    ax.set_ylabel('Probability')

    fig.savefig(os.path.join(out_dir, "histogram.png"), dpi=300)
    fig.savefig(os.path.join(out_dir, "histogram.pdf"))
    plt.close(fig)  # release the figure once it is on disk


def _plot_barplot(target_s, probability, out_dir):
    """Save barplot.png / barplot.pdf comparing p(s = target) with p(s ≠ target)."""
    bar_labels = [f'p(s = {target_s} | drift)', f'p(s ≠ {target_s} | drift)']
    bar_values = [probability, 1 - probability]

    fig, ax = plt.subplots(figsize=(6, 6))
    bars = ax.bar(bar_labels, bar_values, color=['#377eb8', '#e41a1c'], edgecolor='black', alpha=0.8)

    # Evidence lines and labels (adapted from R code): each Bayes factor BF is
    # drawn at height BF / (BF + 1), plus a solid line at the 0.5 prior.
    BF_vals = np.array([3.2, 10, 100])
    prior = 0.5
    strength = BF_vals / (BF_vals + 1)
    y_lines = np.concatenate(([prior], strength))
    labels = ["no support", "weak", "substantial", "strong"]

    ax.axhline(prior, linestyle='solid', color='grey', linewidth=1)
    linestyles = ['dotted', 'dashed', (0, (5, 10))]  # solid already used for prior
    for y, ls in zip(strength, linestyles):
        ax.axhline(y, linestyle=ls, color='grey', linewidth=1)

    # Evidence labels just outside the right edge (x in axes coordinates).
    for y, label in zip(y_lines, labels):
        ax.text(1.05, y, label, ha='left', va='center', color='grey', fontsize=10, transform=ax.get_yaxis_transform())

    # Numeric value above every bar.
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 5),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=12, fontweight='bold')

    ax.set_ylim(0, 1.05)
    ax.set_ylabel('Probability')
    # The second bar is 1 - p, i.e. s different from the target (not s > target).
    ax.set_title('Probability of s = target and s ≠ target')

    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, "barplot.png"), dpi=300)
    fig.savefig(os.path.join(out_dir, "barplot.pdf"))
    plt.close(fig)


def main():
    """Run the phunc command-line workflow.

    Parses the command line, simulates ``n_simulations`` gene trees inside
    the species tree, scores each with the Fitch parsimony down-pass, and
    writes to the output directory: simulated_gene_trees.nwck,
    target_gene_trees.nwck, S_statsProbs.txt, simulated_s.csv, and the
    histogram and bar plots (PNG and PDF).

    Raises:
        ValueError: if ``n_simulations`` is smaller than 1 or the phenotype
            map file is malformed or misses a species.
        SystemExit: if the output directory exists and the user declines.
    """
    args = parse_arguments()

    # Work with absolute paths for the tree and the output directory.
    args.tree = os.path.abspath(args.tree)
    args.out_dir = os.path.abspath(args.out_dir)

    # Convert the number of individuals to a list of integers.
    n_sampled_individuals = [int(n) for n in args.n_sampled_individuals.split(',')]

    args.n_simulations = int(args.n_simulations)
    if args.n_simulations < 1:
        # Guard the later division by n_simulations and the statistics on an
        # empty distribution before any file is touched.
        raise ValueError("n_simulations must be a positive integer.")

    # Check if the directory exists and confirm with the user.
    if os.path.exists(args.out_dir):
        print(f"Warning: Directory '{args.out_dir}' already exists. Proceeding may erase previous outputs.")
        if not confirm_proceed():
            raise SystemExit

    os.makedirs(args.out_dir, exist_ok=True)

    # Read tree and get taxa names.
    containing_taxa = dendropy.TaxonNamespace()
    sp_tree = dendropy.Tree.get(path=args.tree,
                                schema="nexus",
                                preserve_underscores=True,
                                taxon_namespace=containing_taxa)

    genes_to_species = dendropy.TaxonNamespaceMapping.create_contained_taxon_mapping(
        containing_taxon_namespace=containing_taxa,
        num_contained=n_sampled_individuals)

    # Convert to containing tree.
    sp_tree = reconcile.ContainingTree(sp_tree,
                                       contained_taxon_namespace=genes_to_species.domain_taxon_namespace,
                                       contained_to_containing_taxon_map=genes_to_species)

    # Simulate and save gene trees.
    trees = dendropy.TreeList()
    print('Simulating trees')
    for rep in range(args.n_simulations):
        print(rep)
        gene_tree = treesim.contained_coalescent_tree(containing_tree=sp_tree, gene_to_containing_taxon_map=genes_to_species)
        trees.append(gene_tree)

    print('Saving newick simulated trees')
    trees.write(path=os.path.join(args.out_dir, "simulated_gene_trees.nwck"),
                schema="newick")

    # Attribute code (phenotype states) to species.
    phenotype_map = _read_phenotype_map(args.phenotype_map) if args.phenotype_map else None
    taxon_state_sets_map = _build_taxon_state_sets(trees.taxon_namespace, phenotype_map)

    # Iterate over trees to calculate s.
    # NOTE: fitch_down_pass is used on purpose. An earlier version used
    # dendropy's monophyletic_partition_discordance(), which, contrary to the
    # statement in the DendroPy manual, does not seem to be exactly the s
    # statistics.
    s_count = 0
    target_trees = dendropy.TreeList()
    s_distribution = []
    for tree in trees:
        s = fitch_down_pass(tree.postorder_node_iter(),
                            taxon_state_sets_map=taxon_state_sets_map)
        s_distribution.append(s)
        if s == args.target_s_statistics:
            s_count += 1
            target_trees.append(tree)

    probability = s_count / args.n_simulations

    print('Saving newick simulated trees with target s-statistics')
    target_trees.write(path=os.path.join(args.out_dir, "target_gene_trees.nwck"),
                       schema="newick")

    print(f"Probability of s statistics being equal to {args.target_s_statistics}: {probability}")

    with open(os.path.join(args.out_dir, "S_statsProbs.txt"), "w") as results:
        print(f"Probability of s statistics being equal to '{args.target_s_statistics}': '{probability}'", file=results)

    # Save simulated s values.
    df = pd.DataFrame({'simulation': range(1, len(s_distribution) + 1), 's_statistic': s_distribution})
    df.to_csv(os.path.join(args.out_dir, "simulated_s.csv"), index=False)
    print("Simulated s values saved in", os.path.join(args.out_dir,"simulated_s.csv") )

    _plot_histogram(s_distribution, args.target_s_statistics, probability, args.out_dir)
    print("Histograms saved in", os.path.join(args.out_dir,"histogram.pdf"), "and in", os.path.join(args.out_dir,"histogram.png") )

    _plot_barplot(args.target_s_statistics, probability, args.out_dir)
    print("Bar plot saved in", os.path.join(args.out_dir, "barplot.pdf"), "and in", os.path.join(args.out_dir, "barplot.png"))
|
|
287
|
+
|
|
288
|
+
if __name__ == "__main__":
|
|
289
|
+
main()
|