egt 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of egt might be problematic. Click here for more details.
- egt-0.1.0/LICENSE +21 -0
- egt-0.1.0/PKG-INFO +285 -0
- egt-0.1.0/README.md +245 -0
- egt-0.1.0/pyproject.toml +66 -0
- egt-0.1.0/setup.cfg +4 -0
- egt-0.1.0/src/egt/__init__.py +1 -0
- egt-0.1.0/src/egt/_vendor/__init__.py +0 -0
- egt-0.1.0/src/egt/_vendor/odp_plotting_functions.py +81 -0
- egt-0.1.0/src/egt/aggregate_filechecker_benchmarks.py +93 -0
- egt-0.1.0/src/egt/aggregate_filesizes.py +50 -0
- egt-0.1.0/src/egt/algs_split_across_scaffolds.py +204 -0
- egt-0.1.0/src/egt/annotate_sample_df.py +628 -0
- egt-0.1.0/src/egt/bokeh_helper.py +33 -0
- egt-0.1.0/src/egt/cli.py +76 -0
- egt-0.1.0/src/egt/count_unique_changes_per_branch.py +117 -0
- egt-0.1.0/src/egt/defining_features.py +282 -0
- egt-0.1.0/src/egt/defining_features_plot.py +204 -0
- egt-0.1.0/src/egt/defining_features_plotRBH.py +700 -0
- egt-0.1.0/src/egt/fourier_of_rates.py +840 -0
- egt-0.1.0/src/egt/fourier_spectral_background.py +52 -0
- egt-0.1.0/src/egt/get_assembly_sizes.py +49 -0
- egt-0.1.0/src/egt/join_supplementary_tables.py +90 -0
- egt-0.1.0/src/egt/legacy/__init__.py +0 -0
- egt-0.1.0/src/egt/legacy/defining_features_plot2.py +406 -0
- egt-0.1.0/src/egt/legacy/plot_alg_fusions_v1.py +1382 -0
- egt-0.1.0/src/egt/legacy/plot_alg_fusions_v2.py +1349 -0
- egt-0.1.0/src/egt/newick_to_common_ancestors.py +3271 -0
- egt-0.1.0/src/egt/odol_annotate_blast.py +397 -0
- egt-0.1.0/src/egt/perspchrom_df_to_tree.py +2683 -0
- egt-0.1.0/src/egt/phylotreeumap.py +4551 -0
- egt-0.1.0/src/egt/phylotreeumap_plotdfs.py +2023 -0
- egt-0.1.0/src/egt/phylotreeumap_subsample.py +693 -0
- egt-0.1.0/src/egt/phylotreeumap_testpixels.py +139 -0
- egt-0.1.0/src/egt/plot_alg_dispersion.py +466 -0
- egt-0.1.0/src/egt/plot_alg_fusions.py +3208 -0
- egt-0.1.0/src/egt/plot_branch_stats_advanced.py +305 -0
- egt-0.1.0/src/egt/plot_branch_stats_tree.py +346 -0
- egt-0.1.0/src/egt/plot_branch_stats_tree_pair.py +238 -0
- egt-0.1.0/src/egt/plot_branch_stats_vs_time.py +2905 -0
- egt-0.1.0/src/egt/plot_chrom_number_vs_changes.py +689 -0
- egt-0.1.0/src/egt/plot_collapsed_tree.py +319 -0
- egt-0.1.0/src/egt/plot_decay_many_species.py +437 -0
- egt-0.1.0/src/egt/plot_decay_pairwise_steps.py +1466 -0
- egt-0.1.0/src/egt/plot_fourier_support_vs_time.py +134 -0
- egt-0.1.0/src/egt/plot_tree_changes.py +111 -0
- egt-0.1.0/src/egt/pull_entries_from_yaml.py +108 -0
- egt-0.1.0/src/egt/rbh_tools.py +441 -0
- egt-0.1.0/src/egt/taxid_tools.py +75 -0
- egt-0.1.0/src/egt/taxids_to_newick.py +782 -0
- egt-0.1.0/src/egt.egg-info/PKG-INFO +285 -0
- egt-0.1.0/src/egt.egg-info/SOURCES.txt +54 -0
- egt-0.1.0/src/egt.egg-info/dependency_links.txt +1 -0
- egt-0.1.0/src/egt.egg-info/entry_points.txt +2 -0
- egt-0.1.0/src/egt.egg-info/requires.txt +16 -0
- egt-0.1.0/src/egt.egg-info/top_level.txt +1 -0
- egt-0.1.0/tests/test_phylotreeumap.py +81 -0
egt-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Darrin T. Schultz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
egt-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: egt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evolutionary Genome Topology — analysis toolkit for chromosome evolution across metazoan genomes using reciprocal-best-hits data.
|
|
5
|
+
Author-email: "Darrin T. Schultz" <darrin.schultz@univie.ac.at>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/conchoecia/egt
|
|
8
|
+
Project-URL: Source code, https://github.com/conchoecia/egt
|
|
9
|
+
Project-URL: Manuscript, https://github.com/conchoecia/egt/blob/main/docs/manuscript.md
|
|
10
|
+
Project-URL: odp, https://github.com/conchoecia/odp
|
|
11
|
+
Keywords: comparative genomics,chromosome evolution,ancestral linkage groups,ALG,synteny,UMAP,phylogenetics
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: POSIX
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: numpy
|
|
25
|
+
Requires-Dist: pandas
|
|
26
|
+
Requires-Dist: scipy
|
|
27
|
+
Requires-Dist: scikit-learn
|
|
28
|
+
Requires-Dist: matplotlib
|
|
29
|
+
Requires-Dist: networkx
|
|
30
|
+
Requires-Dist: Pillow
|
|
31
|
+
Requires-Dist: umap-learn[plot]
|
|
32
|
+
Requires-Dist: bokeh
|
|
33
|
+
Requires-Dist: ete4
|
|
34
|
+
Requires-Dist: snakemake<9,>=7
|
|
35
|
+
Requires-Dist: pyyaml
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest; extra == "dev"
|
|
38
|
+
Requires-Dist: pyflakes; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# egt — Evolutionary Genome Topology
|
|
42
|
+
|
|
43
|
+
`egt` is a Python / Snakemake analysis toolkit for characterizing
|
|
44
|
+
chromosome evolution across metazoan genomes. It builds on reciprocal-best-hits (RBH)
|
|
45
|
+
data from [`odp`](https://github.com/conchoecia/odp) and provides tools for:
|
|
46
|
+
|
|
47
|
+
- ALG (ancestral linkage group) fusion, dispersal, and rate analyses
|
|
48
|
+
- PhyloTreeUMAP: manifold projection of per-species ALG state (MGT, MLT, and
|
|
49
|
+
one-dot-one-genome (ODOG) variants)
|
|
50
|
+
- perspective-chromosome reconstruction with Monte Carlo support
|
|
51
|
+
- branch-wise rate analyses against a calibrated tree
|
|
52
|
+
- Fourier-period analysis of rate time series
|
|
53
|
+
- phylogenetic subsampling, tree prep, taxonomy utilities
|
|
54
|
+
|
|
55
|
+
## Getting Started
|
|
56
|
+
|
|
57
|
+
```sh
|
|
58
|
+
git clone https://github.com/conchoecia/egt.git
|
|
59
|
+
cd egt
|
|
60
|
+
python -m venv .venv && source .venv/bin/activate
|
|
61
|
+
pip install -e .
|
|
62
|
+
|
|
63
|
+
egt --help
|
|
64
|
+
bash tests/smoke/test_cli.sh
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Primary input is a directory of per-species RBH files produced by `odp`
|
|
68
|
+
against the BCnS ALG database. From there, most analyses are a single
|
|
69
|
+
`egt <subcommand>` call or a Snakefile under `workflows/`.
|
|
70
|
+
|
|
71
|
+
## Quick Start
|
|
72
|
+
|
|
73
|
+
### PhyloTreeUMAP — manifold projection of per-species ALG state
|
|
74
|
+
|
|
75
|
+
```sh
|
|
76
|
+
# 1. build per-sample distance matrices + sampledf
|
|
77
|
+
egt phylotreeumap build-distances \
|
|
78
|
+
--rbh-dir /path/to/rbh_files \
|
|
79
|
+
--alg-name BCnSSimakov2022 \
|
|
80
|
+
--sampledf-out GTUMAP/sampledf.tsv \
|
|
81
|
+
--distance-dir GTUMAP/distance_matrices/
|
|
82
|
+
|
|
83
|
+
# 2. index ALG locus pairs
|
|
84
|
+
egt phylotreeumap algcomboix \
|
|
85
|
+
--alg-rbh /path/to/LG_db/BCnSSimakov2022/BCnSSimakov2022.rbh \
|
|
86
|
+
--output GTUMAP/alg_combo_to_ix.tsv
|
|
87
|
+
|
|
88
|
+
# 3. run the UMAP + HTML plot (MGT / MLT / ODOG variants)
|
|
89
|
+
egt phylotreeumap mgt-mlt-umap --help
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### ALG fusion analysis on a calibrated tree
|
|
93
|
+
|
|
94
|
+
```sh
|
|
95
|
+
egt alg-fusions --help
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Perspective-chromosome tree mapping + Monte Carlo rates
|
|
99
|
+
|
|
100
|
+
```sh
|
|
101
|
+
egt perspchrom-df-to-tree --help
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Rate analyses, Fourier periodicity, branch stats
|
|
105
|
+
|
|
106
|
+
```sh
|
|
107
|
+
egt branch-stats-vs-time --help
|
|
108
|
+
egt fourier-of-rates --help
|
|
109
|
+
egt fourier-support-vs-time --help
|
|
110
|
+
egt collapsed-tree --help
|
|
111
|
+
egt tree-changes --help
|
|
112
|
+
egt decay-pairwise --help
|
|
113
|
+
egt decay-many-species --help
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Phylogeny preparation
|
|
117
|
+
|
|
118
|
+
```sh
|
|
119
|
+
egt taxids-to-newick --help
|
|
120
|
+
egt newick-to-common-ancestors --help
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Users' Guide
|
|
124
|
+
|
|
125
|
+
`egt` is a collection of analysis scripts rather than a monolithic pipeline.
|
|
126
|
+
Each script is also registered as a subcommand of the `egt` console script:
|
|
127
|
+
|
|
128
|
+
```sh
|
|
129
|
+
egt alg-fusions --help
|
|
130
|
+
# equivalent to
|
|
131
|
+
python -m egt.plot_alg_fusions --help
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Installation
|
|
135
|
+
|
|
136
|
+
```sh
|
|
137
|
+
git clone https://github.com/conchoecia/egt.git
|
|
138
|
+
cd egt
|
|
139
|
+
python -m venv .venv && source .venv/bin/activate
|
|
140
|
+
pip install -e .
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Python requirements
|
|
144
|
+
|
|
145
|
+
Python 3.10 or newer. `pip install -e .` pulls the deps from `pyproject.toml`:
|
|
146
|
+
|
|
147
|
+
- numpy, pandas, scipy, scikit-learn, matplotlib, networkx, Pillow
|
|
148
|
+
- umap-learn[plot] — UMAP + the plotting extras needed by PhyloTreeUMAP
|
|
149
|
+
- bokeh — interactive HTML plots
|
|
150
|
+
- ete4 — taxonomy trees and NCBI taxid handling
|
|
151
|
+
- snakemake (>=7, <9)
|
|
152
|
+
- pyyaml
|
|
153
|
+
|
|
154
|
+
Conda equivalent:
|
|
155
|
+
|
|
156
|
+
```sh
|
|
157
|
+
mamba install -c conda-forge -c bioconda \
|
|
158
|
+
python=3.11 numpy pandas scipy scikit-learn matplotlib networkx pillow \
|
|
159
|
+
"umap-learn" bokeh ete4 "snakemake<9" pyyaml
|
|
160
|
+
pip install --no-deps -e .
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Upstream tools
|
|
164
|
+
|
|
165
|
+
`egt` consumes outputs of several companion tools:
|
|
166
|
+
|
|
167
|
+
- [`odp`](https://github.com/conchoecia/odp) — per-species RBH files, ALG
|
|
168
|
+
databases (BCnSSimakov2022 etc.)
|
|
169
|
+
- [`chrombase`](https://github.com/conchoecia/chrombase) — chromosome-scale
|
|
170
|
+
NCBI genome database builder
|
|
171
|
+
- [`genbargo`](https://github.com/conchoecia/genbargo) — embargo-aware
|
|
172
|
+
assembly curation
|
|
173
|
+
- [`chromsim`](https://github.com/conchoecia/chromsim) — chromosome-evolution
|
|
174
|
+
simulations
|
|
175
|
+
|
|
176
|
+
### CLI overview
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
phylotreeumap — UMAP-over-ALG-topology (MGT, MLT, ODOG subcommands)
|
|
180
|
+
phylotreeumap-subsample — subsample species phylogenetically with per-clade caps
|
|
181
|
+
alg-fusions — plot fusion events on a phylogeny (canonical v3)
|
|
182
|
+
alg-dispersion — plot ALG dispersion across species
|
|
183
|
+
perspchrom-df-to-tree — map perspective-chromosome changes onto a tree (Monte Carlo)
|
|
184
|
+
decay-pairwise — pairwise ALG-decay analysis
|
|
185
|
+
decay-many-species — cross-species ALG conservation / decay
|
|
186
|
+
chrom-number-vs-changes — chromosome count vs rearrangement-rate scatter
|
|
187
|
+
branch-stats-vs-time — branch statistics against geologic time
|
|
188
|
+
branch-stats-tree — branch statistics laid out on a tree
|
|
189
|
+
branch-stats-tree-pair — paired branch-stats tree plots
|
|
190
|
+
collapsed-tree — collapsed-tree visualization
|
|
191
|
+
tree-changes — per-branch changes on a tree
|
|
192
|
+
fourier-of-rates — Fourier analysis of chromosomal change rates
|
|
193
|
+
fourier-support-vs-time — Fourier-support-vs-time plots
|
|
194
|
+
count-unique-changes — count unique changes per branch
|
|
195
|
+
defining-features — identify clade-defining features
|
|
196
|
+
defining-features-plot — plot defining features
|
|
197
|
+
defining-features-plotRBH — plot defining features on RBH dataframes
|
|
198
|
+
taxids-to-newick — build a Newick tree from NCBI taxids
|
|
199
|
+
newick-to-common-ancestors — divergence-time annotation from a timetree
|
|
200
|
+
algs-split-across-scaffolds — find ALGs split across scaffolds
|
|
201
|
+
get-assembly-sizes — summarize assembly sizes
|
|
202
|
+
pull-entries-from-yaml — select rows from a YAML sample list
|
|
203
|
+
aggregate-filechecker — aggregate filechecker benchmarks
|
|
204
|
+
aggregate-filesizes — aggregate file-size summaries
|
|
205
|
+
join-supplementary-tables — join table fragments
|
|
206
|
+
phylotreeumap-plotdfs — PhyloTreeUMAP plotting dataframe helper
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Snakemake workflows
|
|
210
|
+
|
|
211
|
+
Multi-stage Snakemake definitions live under `workflows/`:
|
|
212
|
+
|
|
213
|
+
```
|
|
214
|
+
workflows/
|
|
215
|
+
├── phylotree_umap.smk
|
|
216
|
+
├── phylotree_umap_subsampling.smk
|
|
217
|
+
├── perspchrom_df_stats_and_mc.smk
|
|
218
|
+
├── annotate_sample_df.smk
|
|
219
|
+
├── sample_to_num_chromosomes.smk
|
|
220
|
+
├── odol_annotate_blast.smk
|
|
221
|
+
└── pipeline/
|
|
222
|
+
├── README.md
|
|
223
|
+
├── config.template.yaml
|
|
224
|
+
└── run.sh
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Each workflow is standalone and parameterized via a YAML config.
|
|
228
|
+
|
|
229
|
+
### Input file formats
|
|
230
|
+
|
|
231
|
+
- **RBH files** (`.rbh`) — tab-separated reciprocal-best-hits output of `odp`.
|
|
232
|
+
Filenames must embed the NCBI taxid as the second hyphen-separated field,
|
|
233
|
+
e.g. `speciesname-7777-something.rbh`.
|
|
234
|
+
- **Sample dataframe** (`sampledf.tsv`) — output of
|
|
235
|
+
`egt phylotreeumap build-distances`; consumed by most downstream commands.
|
|
236
|
+
- **ALG database RBH** — e.g. `BCnSSimakov2022.rbh`, from `odp`'s LG_db.
|
|
237
|
+
- **Newick trees** — ete4-readable. `egt taxids-to-newick` emits these.
|
|
238
|
+
- **Divergence-time tables** — TSV, as accepted by
|
|
239
|
+
`egt newick-to-common-ancestors`.
|
|
240
|
+
|
|
241
|
+
## Layout
|
|
242
|
+
|
|
243
|
+
```
|
|
244
|
+
egt/
|
|
245
|
+
├── src/egt/ — Python package
|
|
246
|
+
│ ├── cli.py — argparse dispatcher
|
|
247
|
+
│ ├── _vendor/ — vendored, frozen plotting utilities
|
|
248
|
+
│ ├── legacy/ — prior versions of plot_alg_fusions kept for parity
|
|
249
|
+
│ └── *.py — one module per subcommand
|
|
250
|
+
├── workflows/ — Snakemake workflows
|
|
251
|
+
├── configs/ — example configs
|
|
252
|
+
├── data/ — small bundled data
|
|
253
|
+
├── tests/
|
|
254
|
+
│ ├── testdb/ — mini_hydra + mini_urchin fixtures
|
|
255
|
+
│ └── smoke/test_cli.sh — CLI smoke test
|
|
256
|
+
└── docs/
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Related tools
|
|
260
|
+
|
|
261
|
+
- [`odp`](https://github.com/conchoecia/odp)
|
|
262
|
+
- [`chrombase`](https://github.com/conchoecia/chrombase)
|
|
263
|
+
- [`genbargo`](https://github.com/conchoecia/genbargo)
|
|
264
|
+
- [`chromsim`](https://github.com/conchoecia/chromsim)
|
|
265
|
+
|
|
266
|
+
## Citing egt
|
|
267
|
+
|
|
268
|
+
If you use this toolkit, please cite:
|
|
269
|
+
|
|
270
|
+
> Schultz, D.T., Blümel, A., Destanović, D., Sarigol, F., Simakov, O. (2024).
|
|
271
|
+
> *Topological mixing and irreversibility in animal chromosome evolution.*
|
|
272
|
+
> bioRxiv. [doi:10.1101/2024.07.29.605683](https://doi.org/10.1101/2024.07.29.605683)
|
|
273
|
+
|
|
274
|
+
For background on the topological framework for comparative genomics, see:
|
|
275
|
+
|
|
276
|
+
> Schultz, D.T., Simakov, O. (2026).
|
|
277
|
+
> *Topological Approaches in Animal Comparative Genomics.*
|
|
278
|
+
> Annual Review of Animal Biosciences 14(1), 17–48.
|
|
279
|
+
> [doi:10.1146/annurev-animal-030424-084541](https://doi.org/10.1146/annurev-animal-030424-084541)
|
|
280
|
+
|
|
281
|
+
See also [`CITATION.cff`](CITATION.cff).
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
MIT — see [`LICENSE`](LICENSE).
|
egt-0.1.0/README.md
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# egt — Evolutionary Genome Topology
|
|
2
|
+
|
|
3
|
+
`egt` is a Python / Snakemake analysis toolkit for characterizing
|
|
4
|
+
chromosome evolution across metazoan genomes. It builds on reciprocal-best-hits (RBH)
|
|
5
|
+
data from [`odp`](https://github.com/conchoecia/odp) and provides tools for:
|
|
6
|
+
|
|
7
|
+
- ALG (ancestral linkage group) fusion, dispersal, and rate analyses
|
|
8
|
+
- PhyloTreeUMAP: manifold projection of per-species ALG state (MGT, MLT, and
|
|
9
|
+
one-dot-one-genome (ODOG) variants)
|
|
10
|
+
- perspective-chromosome reconstruction with Monte Carlo support
|
|
11
|
+
- branch-wise rate analyses against a calibrated tree
|
|
12
|
+
- Fourier-period analysis of rate time series
|
|
13
|
+
- phylogenetic subsampling, tree prep, taxonomy utilities
|
|
14
|
+
|
|
15
|
+
## Getting Started
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
git clone https://github.com/conchoecia/egt.git
|
|
19
|
+
cd egt
|
|
20
|
+
python -m venv .venv && source .venv/bin/activate
|
|
21
|
+
pip install -e .
|
|
22
|
+
|
|
23
|
+
egt --help
|
|
24
|
+
bash tests/smoke/test_cli.sh
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Primary input is a directory of per-species RBH files produced by `odp`
|
|
28
|
+
against the BCnS ALG database. From there, most analyses are a single
|
|
29
|
+
`egt <subcommand>` call or a Snakefile under `workflows/`.
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
### PhyloTreeUMAP — manifold projection of per-species ALG state
|
|
34
|
+
|
|
35
|
+
```sh
|
|
36
|
+
# 1. build per-sample distance matrices + sampledf
|
|
37
|
+
egt phylotreeumap build-distances \
|
|
38
|
+
--rbh-dir /path/to/rbh_files \
|
|
39
|
+
--alg-name BCnSSimakov2022 \
|
|
40
|
+
--sampledf-out GTUMAP/sampledf.tsv \
|
|
41
|
+
--distance-dir GTUMAP/distance_matrices/
|
|
42
|
+
|
|
43
|
+
# 2. index ALG locus pairs
|
|
44
|
+
egt phylotreeumap algcomboix \
|
|
45
|
+
--alg-rbh /path/to/LG_db/BCnSSimakov2022/BCnSSimakov2022.rbh \
|
|
46
|
+
--output GTUMAP/alg_combo_to_ix.tsv
|
|
47
|
+
|
|
48
|
+
# 3. run the UMAP + HTML plot (MGT / MLT / ODOG variants)
|
|
49
|
+
egt phylotreeumap mgt-mlt-umap --help
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### ALG fusion analysis on a calibrated tree
|
|
53
|
+
|
|
54
|
+
```sh
|
|
55
|
+
egt alg-fusions --help
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Perspective-chromosome tree mapping + Monte Carlo rates
|
|
59
|
+
|
|
60
|
+
```sh
|
|
61
|
+
egt perspchrom-df-to-tree --help
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Rate analyses, Fourier periodicity, branch stats
|
|
65
|
+
|
|
66
|
+
```sh
|
|
67
|
+
egt branch-stats-vs-time --help
|
|
68
|
+
egt fourier-of-rates --help
|
|
69
|
+
egt fourier-support-vs-time --help
|
|
70
|
+
egt collapsed-tree --help
|
|
71
|
+
egt tree-changes --help
|
|
72
|
+
egt decay-pairwise --help
|
|
73
|
+
egt decay-many-species --help
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Phylogeny preparation
|
|
77
|
+
|
|
78
|
+
```sh
|
|
79
|
+
egt taxids-to-newick --help
|
|
80
|
+
egt newick-to-common-ancestors --help
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Users' Guide
|
|
84
|
+
|
|
85
|
+
`egt` is a collection of analysis scripts rather than a monolithic pipeline.
|
|
86
|
+
Each script is also registered as a subcommand of the `egt` console script:
|
|
87
|
+
|
|
88
|
+
```sh
|
|
89
|
+
egt alg-fusions --help
|
|
90
|
+
# equivalent to
|
|
91
|
+
python -m egt.plot_alg_fusions --help
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Installation
|
|
95
|
+
|
|
96
|
+
```sh
|
|
97
|
+
git clone https://github.com/conchoecia/egt.git
|
|
98
|
+
cd egt
|
|
99
|
+
python -m venv .venv && source .venv/bin/activate
|
|
100
|
+
pip install -e .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Python requirements
|
|
104
|
+
|
|
105
|
+
Python 3.10 or newer. `pip install -e .` pulls the deps from `pyproject.toml`:
|
|
106
|
+
|
|
107
|
+
- numpy, pandas, scipy, scikit-learn, matplotlib, networkx, Pillow
|
|
108
|
+
- umap-learn[plot] — UMAP + the plotting extras needed by PhyloTreeUMAP
|
|
109
|
+
- bokeh — interactive HTML plots
|
|
110
|
+
- ete4 — taxonomy trees and NCBI taxid handling
|
|
111
|
+
- snakemake (>=7, <9)
|
|
112
|
+
- pyyaml
|
|
113
|
+
|
|
114
|
+
Conda equivalent:
|
|
115
|
+
|
|
116
|
+
```sh
|
|
117
|
+
mamba install -c conda-forge -c bioconda \
|
|
118
|
+
python=3.11 numpy pandas scipy scikit-learn matplotlib networkx pillow \
|
|
119
|
+
"umap-learn" bokeh ete4 "snakemake<9" pyyaml
|
|
120
|
+
pip install --no-deps -e .
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Upstream tools
|
|
124
|
+
|
|
125
|
+
`egt` consumes outputs of several companion tools:
|
|
126
|
+
|
|
127
|
+
- [`odp`](https://github.com/conchoecia/odp) — per-species RBH files, ALG
|
|
128
|
+
databases (BCnSSimakov2022 etc.)
|
|
129
|
+
- [`chrombase`](https://github.com/conchoecia/chrombase) — chromosome-scale
|
|
130
|
+
NCBI genome database builder
|
|
131
|
+
- [`genbargo`](https://github.com/conchoecia/genbargo) — embargo-aware
|
|
132
|
+
assembly curation
|
|
133
|
+
- [`chromsim`](https://github.com/conchoecia/chromsim) — chromosome-evolution
|
|
134
|
+
simulations
|
|
135
|
+
|
|
136
|
+
### CLI overview
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
phylotreeumap — UMAP-over-ALG-topology (MGT, MLT, ODOG subcommands)
|
|
140
|
+
phylotreeumap-subsample — subsample species phylogenetically with per-clade caps
|
|
141
|
+
alg-fusions — plot fusion events on a phylogeny (canonical v3)
|
|
142
|
+
alg-dispersion — plot ALG dispersion across species
|
|
143
|
+
perspchrom-df-to-tree — map perspective-chromosome changes onto a tree (Monte Carlo)
|
|
144
|
+
decay-pairwise — pairwise ALG-decay analysis
|
|
145
|
+
decay-many-species — cross-species ALG conservation / decay
|
|
146
|
+
chrom-number-vs-changes — chromosome count vs rearrangement-rate scatter
|
|
147
|
+
branch-stats-vs-time — branch statistics against geologic time
|
|
148
|
+
branch-stats-tree — branch statistics laid out on a tree
|
|
149
|
+
branch-stats-tree-pair — paired branch-stats tree plots
|
|
150
|
+
collapsed-tree — collapsed-tree visualization
|
|
151
|
+
tree-changes — per-branch changes on a tree
|
|
152
|
+
fourier-of-rates — Fourier analysis of chromosomal change rates
|
|
153
|
+
fourier-support-vs-time — Fourier-support-vs-time plots
|
|
154
|
+
count-unique-changes — count unique changes per branch
|
|
155
|
+
defining-features — identify clade-defining features
|
|
156
|
+
defining-features-plot — plot defining features
|
|
157
|
+
defining-features-plotRBH — plot defining features on RBH dataframes
|
|
158
|
+
taxids-to-newick — build a Newick tree from NCBI taxids
|
|
159
|
+
newick-to-common-ancestors — divergence-time annotation from a timetree
|
|
160
|
+
algs-split-across-scaffolds — find ALGs split across scaffolds
|
|
161
|
+
get-assembly-sizes — summarize assembly sizes
|
|
162
|
+
pull-entries-from-yaml — select rows from a YAML sample list
|
|
163
|
+
aggregate-filechecker — aggregate filechecker benchmarks
|
|
164
|
+
aggregate-filesizes — aggregate file-size summaries
|
|
165
|
+
join-supplementary-tables — join table fragments
|
|
166
|
+
phylotreeumap-plotdfs — PhyloTreeUMAP plotting dataframe helper
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Snakemake workflows
|
|
170
|
+
|
|
171
|
+
Multi-stage Snakemake definitions live under `workflows/`:
|
|
172
|
+
|
|
173
|
+
```
|
|
174
|
+
workflows/
|
|
175
|
+
├── phylotree_umap.smk
|
|
176
|
+
├── phylotree_umap_subsampling.smk
|
|
177
|
+
├── perspchrom_df_stats_and_mc.smk
|
|
178
|
+
├── annotate_sample_df.smk
|
|
179
|
+
├── sample_to_num_chromosomes.smk
|
|
180
|
+
├── odol_annotate_blast.smk
|
|
181
|
+
└── pipeline/
|
|
182
|
+
├── README.md
|
|
183
|
+
├── config.template.yaml
|
|
184
|
+
└── run.sh
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Each workflow is standalone and parameterized via a YAML config.
|
|
188
|
+
|
|
189
|
+
### Input file formats
|
|
190
|
+
|
|
191
|
+
- **RBH files** (`.rbh`) — tab-separated reciprocal-best-hits output of `odp`.
|
|
192
|
+
Filenames must embed the NCBI taxid as the second hyphen-separated field,
|
|
193
|
+
e.g. `speciesname-7777-something.rbh`.
|
|
194
|
+
- **Sample dataframe** (`sampledf.tsv`) — output of
|
|
195
|
+
`egt phylotreeumap build-distances`; consumed by most downstream commands.
|
|
196
|
+
- **ALG database RBH** — e.g. `BCnSSimakov2022.rbh`, from `odp`'s LG_db.
|
|
197
|
+
- **Newick trees** — ete4-readable. `egt taxids-to-newick` emits these.
|
|
198
|
+
- **Divergence-time tables** — TSV, as accepted by
|
|
199
|
+
`egt newick-to-common-ancestors`.
|
|
200
|
+
|
|
201
|
+
## Layout
|
|
202
|
+
|
|
203
|
+
```
|
|
204
|
+
egt/
|
|
205
|
+
├── src/egt/ — Python package
|
|
206
|
+
│ ├── cli.py — argparse dispatcher
|
|
207
|
+
│ ├── _vendor/ — vendored, frozen plotting utilities
|
|
208
|
+
│ ├── legacy/ — prior versions of plot_alg_fusions kept for parity
|
|
209
|
+
│ └── *.py — one module per subcommand
|
|
210
|
+
├── workflows/ — Snakemake workflows
|
|
211
|
+
├── configs/ — example configs
|
|
212
|
+
├── data/ — small bundled data
|
|
213
|
+
├── tests/
|
|
214
|
+
│ ├── testdb/ — mini_hydra + mini_urchin fixtures
|
|
215
|
+
│ └── smoke/test_cli.sh — CLI smoke test
|
|
216
|
+
└── docs/
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Related tools
|
|
220
|
+
|
|
221
|
+
- [`odp`](https://github.com/conchoecia/odp)
|
|
222
|
+
- [`chrombase`](https://github.com/conchoecia/chrombase)
|
|
223
|
+
- [`genbargo`](https://github.com/conchoecia/genbargo)
|
|
224
|
+
- [`chromsim`](https://github.com/conchoecia/chromsim)
|
|
225
|
+
|
|
226
|
+
## Citing egt
|
|
227
|
+
|
|
228
|
+
If you use this toolkit, please cite:
|
|
229
|
+
|
|
230
|
+
> Schultz, D.T., Blümel, A., Destanović, D., Sarigol, F., Simakov, O. (2024).
|
|
231
|
+
> *Topological mixing and irreversibility in animal chromosome evolution.*
|
|
232
|
+
> bioRxiv. [doi:10.1101/2024.07.29.605683](https://doi.org/10.1101/2024.07.29.605683)
|
|
233
|
+
|
|
234
|
+
For background on the topological framework for comparative genomics, see:
|
|
235
|
+
|
|
236
|
+
> Schultz, D.T., Simakov, O. (2026).
|
|
237
|
+
> *Topological Approaches in Animal Comparative Genomics.*
|
|
238
|
+
> Annual Review of Animal Biosciences 14(1), 17–48.
|
|
239
|
+
> [doi:10.1146/annurev-animal-030424-084541](https://doi.org/10.1146/annurev-animal-030424-084541)
|
|
240
|
+
|
|
241
|
+
See also [`CITATION.cff`](CITATION.cff).
|
|
242
|
+
|
|
243
|
+
## License
|
|
244
|
+
|
|
245
|
+
MIT — see [`LICENSE`](LICENSE).
|
egt-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "egt"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Evolutionary Genome Topology — analysis toolkit for chromosome evolution across metazoan genomes using reciprocal-best-hits data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Darrin T. Schultz", email = "darrin.schultz@univie.ac.at" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"comparative genomics",
|
|
17
|
+
"chromosome evolution",
|
|
18
|
+
"ancestral linkage groups",
|
|
19
|
+
"ALG",
|
|
20
|
+
"synteny",
|
|
21
|
+
"UMAP",
|
|
22
|
+
"phylogenetics",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 4 - Beta",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"License :: OSI Approved :: MIT License",
|
|
28
|
+
"Operating System :: POSIX",
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"Programming Language :: Python :: 3.10",
|
|
31
|
+
"Programming Language :: Python :: 3.11",
|
|
32
|
+
"Programming Language :: Python :: 3.12",
|
|
33
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
34
|
+
]
|
|
35
|
+
dependencies = [
|
|
36
|
+
"numpy",
|
|
37
|
+
"pandas",
|
|
38
|
+
"scipy",
|
|
39
|
+
"scikit-learn",
|
|
40
|
+
"matplotlib",
|
|
41
|
+
"networkx",
|
|
42
|
+
"Pillow",
|
|
43
|
+
"umap-learn[plot]",
|
|
44
|
+
"bokeh",
|
|
45
|
+
"ete4",
|
|
46
|
+
"snakemake>=7,<9",
|
|
47
|
+
"pyyaml",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.optional-dependencies]
|
|
51
|
+
dev = ["pytest", "pyflakes"]
|
|
52
|
+
|
|
53
|
+
[project.urls]
|
|
54
|
+
Homepage = "https://github.com/conchoecia/egt"
|
|
55
|
+
"Source code" = "https://github.com/conchoecia/egt"
|
|
56
|
+
"Manuscript" = "https://github.com/conchoecia/egt/blob/main/docs/manuscript.md"
|
|
57
|
+
"odp" = "https://github.com/conchoecia/odp"
|
|
58
|
+
|
|
59
|
+
[project.scripts]
|
|
60
|
+
egt = "egt.cli:main"
|
|
61
|
+
|
|
62
|
+
[tool.setuptools.packages.find]
|
|
63
|
+
where = ["src"]
|
|
64
|
+
|
|
65
|
+
[tool.setuptools.package-data]
|
|
66
|
+
egt = ["py.typed"]
|
egt-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
File without changes
|