egt 0.1.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of egt might be problematic. Click here for more details.

Files changed (56)
  1. egt-0.1.0/LICENSE +21 -0
  2. egt-0.1.0/PKG-INFO +285 -0
  3. egt-0.1.0/README.md +245 -0
  4. egt-0.1.0/pyproject.toml +66 -0
  5. egt-0.1.0/setup.cfg +4 -0
  6. egt-0.1.0/src/egt/__init__.py +1 -0
  7. egt-0.1.0/src/egt/_vendor/__init__.py +0 -0
  8. egt-0.1.0/src/egt/_vendor/odp_plotting_functions.py +81 -0
  9. egt-0.1.0/src/egt/aggregate_filechecker_benchmarks.py +93 -0
  10. egt-0.1.0/src/egt/aggregate_filesizes.py +50 -0
  11. egt-0.1.0/src/egt/algs_split_across_scaffolds.py +204 -0
  12. egt-0.1.0/src/egt/annotate_sample_df.py +628 -0
  13. egt-0.1.0/src/egt/bokeh_helper.py +33 -0
  14. egt-0.1.0/src/egt/cli.py +76 -0
  15. egt-0.1.0/src/egt/count_unique_changes_per_branch.py +117 -0
  16. egt-0.1.0/src/egt/defining_features.py +282 -0
  17. egt-0.1.0/src/egt/defining_features_plot.py +204 -0
  18. egt-0.1.0/src/egt/defining_features_plotRBH.py +700 -0
  19. egt-0.1.0/src/egt/fourier_of_rates.py +840 -0
  20. egt-0.1.0/src/egt/fourier_spectral_background.py +52 -0
  21. egt-0.1.0/src/egt/get_assembly_sizes.py +49 -0
  22. egt-0.1.0/src/egt/join_supplementary_tables.py +90 -0
  23. egt-0.1.0/src/egt/legacy/__init__.py +0 -0
  24. egt-0.1.0/src/egt/legacy/defining_features_plot2.py +406 -0
  25. egt-0.1.0/src/egt/legacy/plot_alg_fusions_v1.py +1382 -0
  26. egt-0.1.0/src/egt/legacy/plot_alg_fusions_v2.py +1349 -0
  27. egt-0.1.0/src/egt/newick_to_common_ancestors.py +3271 -0
  28. egt-0.1.0/src/egt/odol_annotate_blast.py +397 -0
  29. egt-0.1.0/src/egt/perspchrom_df_to_tree.py +2683 -0
  30. egt-0.1.0/src/egt/phylotreeumap.py +4551 -0
  31. egt-0.1.0/src/egt/phylotreeumap_plotdfs.py +2023 -0
  32. egt-0.1.0/src/egt/phylotreeumap_subsample.py +693 -0
  33. egt-0.1.0/src/egt/phylotreeumap_testpixels.py +139 -0
  34. egt-0.1.0/src/egt/plot_alg_dispersion.py +466 -0
  35. egt-0.1.0/src/egt/plot_alg_fusions.py +3208 -0
  36. egt-0.1.0/src/egt/plot_branch_stats_advanced.py +305 -0
  37. egt-0.1.0/src/egt/plot_branch_stats_tree.py +346 -0
  38. egt-0.1.0/src/egt/plot_branch_stats_tree_pair.py +238 -0
  39. egt-0.1.0/src/egt/plot_branch_stats_vs_time.py +2905 -0
  40. egt-0.1.0/src/egt/plot_chrom_number_vs_changes.py +689 -0
  41. egt-0.1.0/src/egt/plot_collapsed_tree.py +319 -0
  42. egt-0.1.0/src/egt/plot_decay_many_species.py +437 -0
  43. egt-0.1.0/src/egt/plot_decay_pairwise_steps.py +1466 -0
  44. egt-0.1.0/src/egt/plot_fourier_support_vs_time.py +134 -0
  45. egt-0.1.0/src/egt/plot_tree_changes.py +111 -0
  46. egt-0.1.0/src/egt/pull_entries_from_yaml.py +108 -0
  47. egt-0.1.0/src/egt/rbh_tools.py +441 -0
  48. egt-0.1.0/src/egt/taxid_tools.py +75 -0
  49. egt-0.1.0/src/egt/taxids_to_newick.py +782 -0
  50. egt-0.1.0/src/egt.egg-info/PKG-INFO +285 -0
  51. egt-0.1.0/src/egt.egg-info/SOURCES.txt +54 -0
  52. egt-0.1.0/src/egt.egg-info/dependency_links.txt +1 -0
  53. egt-0.1.0/src/egt.egg-info/entry_points.txt +2 -0
  54. egt-0.1.0/src/egt.egg-info/requires.txt +16 -0
  55. egt-0.1.0/src/egt.egg-info/top_level.txt +1 -0
  56. egt-0.1.0/tests/test_phylotreeumap.py +81 -0
egt-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Darrin T. Schultz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
egt-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,285 @@
1
+ Metadata-Version: 2.4
2
+ Name: egt
3
+ Version: 0.1.0
4
+ Summary: Evolutionary Genome Topology — analysis toolkit for chromosome evolution across metazoan genomes using reciprocal-best-hits data.
5
+ Author-email: "Darrin T. Schultz" <darrin.schultz@univie.ac.at>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/conchoecia/egt
8
+ Project-URL: Source code, https://github.com/conchoecia/egt
9
+ Project-URL: Manuscript, https://github.com/conchoecia/egt/blob/main/docs/manuscript.md
10
+ Project-URL: odp, https://github.com/conchoecia/odp
11
+ Keywords: comparative genomics,chromosome evolution,ancestral linkage groups,ALG,synteny,UMAP,phylogenetics
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: POSIX
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: numpy
25
+ Requires-Dist: pandas
26
+ Requires-Dist: scipy
27
+ Requires-Dist: scikit-learn
28
+ Requires-Dist: matplotlib
29
+ Requires-Dist: networkx
30
+ Requires-Dist: Pillow
31
+ Requires-Dist: umap-learn[plot]
32
+ Requires-Dist: bokeh
33
+ Requires-Dist: ete4
34
+ Requires-Dist: snakemake<9,>=7
35
+ Requires-Dist: pyyaml
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest; extra == "dev"
38
+ Requires-Dist: pyflakes; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # egt — Evolutionary Genome Topology
42
+
43
+ `egt` is a Python / Snakemake analysis toolkit for characterizing
44
+ chromosome evolution across metazoan genomes. It builds on reciprocal-best-hits
45
+ data from [`odp`](https://github.com/conchoecia/odp) and provides tools for:
46
+
47
+ - ALG (ancestral linkage group) fusion, dispersal, and rate analyses
48
+ - PhyloTreeUMAP: manifold projection of per-species ALG state (MGT, MLT, and
49
+ one-dot-one-genome variants)
50
+ - perspective-chromosome reconstruction with Monte Carlo support
51
+ - branch-wise rate analyses against a calibrated tree
52
+ - Fourier-period analysis of rate time series
53
+ - phylogenetic subsampling, tree prep, taxonomy utilities
54
+
55
+ ## Getting Started
56
+
57
+ ```sh
58
+ git clone https://github.com/conchoecia/egt.git
59
+ cd egt
60
+ python -m venv .venv && source .venv/bin/activate
61
+ pip install -e .
62
+
63
+ egt --help
64
+ bash tests/smoke/test_cli.sh
65
+ ```
66
+
67
+ Primary input is a directory of per-species RBH files produced by `odp`
68
+ against the BCnS ALG database. From there, most analyses are a single
69
+ `egt <subcommand>` call or a Snakefile under `workflows/`.
70
+
71
+ ## Quick Start
72
+
73
+ ### PhyloTreeUMAP — manifold projection of per-species ALG state
74
+
75
+ ```sh
76
+ # 1. build per-sample distance matrices + sampledf
77
+ egt phylotreeumap build-distances \
78
+ --rbh-dir /path/to/rbh_files \
79
+ --alg-name BCnSSimakov2022 \
80
+ --sampledf-out GTUMAP/sampledf.tsv \
81
+ --distance-dir GTUMAP/distance_matrices/
82
+
83
+ # 2. index ALG locus pairs
84
+ egt phylotreeumap algcomboix \
85
+ --alg-rbh /path/to/LG_db/BCnSSimakov2022/BCnSSimakov2022.rbh \
86
+ --output GTUMAP/alg_combo_to_ix.tsv
87
+
88
+ # 3. run the UMAP + HTML plot (MGT / MLT / ODOG variants)
89
+ egt phylotreeumap mgt-mlt-umap --help
90
+ ```
91
+
92
+ ### ALG fusion analysis on a calibrated tree
93
+
94
+ ```sh
95
+ egt alg-fusions --help
96
+ ```
97
+
98
+ ### Perspective-chromosome tree mapping + Monte Carlo rates
99
+
100
+ ```sh
101
+ egt perspchrom-df-to-tree --help
102
+ ```
103
+
104
+ ### Rate analyses, Fourier periodicity, branch stats
105
+
106
+ ```sh
107
+ egt branch-stats-vs-time --help
108
+ egt fourier-of-rates --help
109
+ egt fourier-support-vs-time --help
110
+ egt collapsed-tree --help
111
+ egt tree-changes --help
112
+ egt decay-pairwise --help
113
+ egt decay-many-species --help
114
+ ```
115
+
116
+ ### Phylogeny preparation
117
+
118
+ ```sh
119
+ egt taxids-to-newick --help
120
+ egt newick-to-common-ancestors --help
121
+ ```
122
+
123
+ ## Users' Guide
124
+
125
+ `egt` is a collection of analysis scripts rather than a monolithic pipeline.
126
+ Each script is also registered as a subcommand of the `egt` console script:
127
+
128
+ ```sh
129
+ egt alg-fusions --help
130
+ # equivalent to
131
+ python -m egt.plot_alg_fusions --help
132
+ ```
133
+
134
+ ### Installation
135
+
136
+ ```sh
137
+ git clone https://github.com/conchoecia/egt.git
138
+ cd egt
139
+ python -m venv .venv && source .venv/bin/activate
140
+ pip install -e .
141
+ ```
142
+
143
+ ### Python requirements
144
+
145
+ Python 3.10 or newer. `pip install -e .` pulls the deps from `pyproject.toml`:
146
+
147
+ - numpy, pandas, scipy, scikit-learn, matplotlib, networkx, Pillow
148
+ - umap-learn[plot] — UMAP + the plotting extras needed by PhyloTreeUMAP
149
+ - bokeh — interactive HTML plots
150
+ - ete4 — taxonomy trees and NCBI taxid handling
151
+ - snakemake (>=7, <9)
152
+ - pyyaml
153
+
154
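+ Conda equivalent. Note that conda packages do not support pip-style extras, so any `umap-learn[plot]` plotting dependencies beyond the packages listed here are not installed by this command and must be added separately: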
155
+
156
+ ```sh
157
+ mamba install -c conda-forge -c bioconda \
158
+ python=3.11 numpy pandas scipy scikit-learn matplotlib networkx pillow \
159
+ "umap-learn" bokeh ete4 "snakemake<9" pyyaml
160
+ pip install --no-deps -e .
161
+ ```
162
+
163
+ ### Upstream tools
164
+
165
+ `egt` consumes outputs of several companion tools:
166
+
167
+ - [`odp`](https://github.com/conchoecia/odp) — per-species RBH files, ALG
168
+ databases (BCnSSimakov2022 etc.)
169
+ - [`chrombase`](https://github.com/conchoecia/chrombase) — chromosome-scale
170
+ NCBI genome database builder
171
+ - [`genbargo`](https://github.com/conchoecia/genbargo) — embargo-aware
172
+ assembly curation
173
+ - [`chromsim`](https://github.com/conchoecia/chromsim) — chromosome-evolution
174
+ simulations
175
+
176
+ ### CLI overview
177
+
178
+ ```
179
+ phylotreeumap — UMAP-over-ALG-topology (MGT, MLT, ODOG subcommands)
180
+ phylotreeumap-subsample — subsample species phylogenetically with per-clade caps
181
+ alg-fusions — plot fusion events on a phylogeny (canonical v3)
182
+ alg-dispersion — plot ALG dispersion across species
183
+ perspchrom-df-to-tree — map perspective-chromosome changes onto a tree (Monte Carlo)
184
+ decay-pairwise — pairwise ALG-decay analysis
185
+ decay-many-species — cross-species ALG conservation / decay
186
+ chrom-number-vs-changes — chromosome count vs rearrangement-rate scatter
187
+ branch-stats-vs-time — branch statistics against geologic time
188
+ branch-stats-tree — branch statistics laid out on a tree
189
+ branch-stats-tree-pair — paired branch-stats tree plots
190
+ collapsed-tree — collapsed-tree visualization
191
+ tree-changes — per-branch changes on a tree
192
+ fourier-of-rates — Fourier analysis of chromosomal change rates
193
+ fourier-support-vs-time — Fourier-support-vs-time plots
194
+ count-unique-changes — count unique changes per branch
195
+ defining-features — identify clade-defining features
196
+ defining-features-plot — plot defining features
197
+ defining-features-plotRBH — plot defining features on RBH dataframes
198
+ taxids-to-newick — build a Newick tree from NCBI taxids
199
+ newick-to-common-ancestors — divergence-time annotation from a timetree
200
+ algs-split-across-scaffolds — find ALGs split across scaffolds
201
+ get-assembly-sizes — summarize assembly sizes
202
+ pull-entries-from-yaml — select rows from a YAML sample list
203
+ aggregate-filechecker — aggregate filechecker benchmarks
204
+ aggregate-filesizes — aggregate file-size summaries
205
+ join-supplementary-tables — join table fragments
206
+ phylotreeumap-plotdfs — PhyloTreeUMAP plotting dataframe helper
207
+ ```
208
+
209
+ ### Snakemake workflows
210
+
211
+ Multi-stage Snakemake definitions live under `workflows/`:
212
+
213
+ ```
214
+ workflows/
215
+ ├── phylotree_umap.smk
216
+ ├── phylotree_umap_subsampling.smk
217
+ ├── perspchrom_df_stats_and_mc.smk
218
+ ├── annotate_sample_df.smk
219
+ ├── sample_to_num_chromosomes.smk
220
+ ├── odol_annotate_blast.smk
221
+ └── pipeline/
222
+ ├── README.md
223
+ ├── config.template.yaml
224
+ └── run.sh
225
+ ```
226
+
227
+ Each workflow is standalone and parameterized via a YAML config.
228
+
229
+ ### Input file formats
230
+
231
+ - **RBH files** (`.rbh`) — tab-separated reciprocal-best-hits output of `odp`.
232
+ Filenames must embed the NCBI taxid as the second hyphen-separated field,
233
+ e.g. `speciesname-7777-something.rbh`.
234
+ - **Sample dataframe** (`sampledf.tsv`) — output of
235
+ `egt phylotreeumap build-distances`; consumed by most downstream commands.
236
+ - **ALG database RBH** — e.g. `BCnSSimakov2022.rbh`, from `odp`'s LG_db.
237
+ - **Newick trees** — ete4-readable. `egt taxids-to-newick` emits these.
238
+ - **Divergence-time tables** — TSV, as accepted by
239
+ `egt newick-to-common-ancestors`.
240
+
241
+ ## Layout
242
+
243
+ ```
244
+ egt/
245
+ ├── src/egt/ — Python package
246
+ │ ├── cli.py — argparse dispatcher
247
+ │ ├── _vendor/ — vendored, frozen plotting utilities
248
+ │ ├── legacy/ — prior versions of plot_ALG_fusions kept for parity
249
+ │ └── *.py — one module per subcommand
250
+ ├── workflows/ — Snakemake workflows
251
+ ├── configs/ — example configs
252
+ ├── data/ — small bundled data
253
+ ├── tests/
254
+ │ ├── testdb/ — mini_hydra + mini_urchin fixtures
255
+ │ └── smoke/test_cli.sh — CLI smoke test
256
+ └── docs/
257
+ ```
258
+
259
+ ## Related tools
260
+
261
+ - [`odp`](https://github.com/conchoecia/odp)
262
+ - [`chrombase`](https://github.com/conchoecia/chrombase)
263
+ - [`genbargo`](https://github.com/conchoecia/genbargo)
264
+ - [`chromsim`](https://github.com/conchoecia/chromsim)
265
+
266
+ ## Citing egt
267
+
268
+ If you use this toolkit, please cite:
269
+
270
+ > Schultz, D.T., Blümel, A., Destanović, D., Sarigol, F., Simakov, O. (2024).
271
+ > *Topological mixing and irreversibility in animal chromosome evolution.*
272
+ > bioRxiv. [doi:10.1101/2024.07.29.605683](https://doi.org/10.1101/2024.07.29.605683)
273
+
274
+ For background on the topological framework for comparative genomics, see:
275
+
276
+ > Schultz, D.T., Simakov, O. (2026).
277
+ > *Topological Approaches in Animal Comparative Genomics.*
278
+ > Annual Review of Animal Biosciences 14(1), 17–48.
279
+ > [doi:10.1146/annurev-animal-030424-084541](https://doi.org/10.1146/annurev-animal-030424-084541)
280
+
281
+ See also [`CITATION.cff`](CITATION.cff).
282
+
283
+ ## License
284
+
285
+ MIT — see [`LICENSE`](LICENSE).
egt-0.1.0/README.md ADDED
@@ -0,0 +1,245 @@
1
+ # egt — Evolutionary Genome Topology
2
+
3
+ `egt` is a Python / Snakemake analysis toolkit for characterizing
4
+ chromosome evolution across metazoan genomes. It builds on reciprocal-best-hits
5
+ data from [`odp`](https://github.com/conchoecia/odp) and provides tools for:
6
+
7
+ - ALG (ancestral linkage group) fusion, dispersal, and rate analyses
8
+ - PhyloTreeUMAP: manifold projection of per-species ALG state (MGT, MLT, and
9
+ one-dot-one-genome variants)
10
+ - perspective-chromosome reconstruction with Monte Carlo support
11
+ - branch-wise rate analyses against a calibrated tree
12
+ - Fourier-period analysis of rate time series
13
+ - phylogenetic subsampling, tree prep, taxonomy utilities
14
+
15
+ ## Getting Started
16
+
17
+ ```sh
18
+ git clone https://github.com/conchoecia/egt.git
19
+ cd egt
20
+ python -m venv .venv && source .venv/bin/activate
21
+ pip install -e .
22
+
23
+ egt --help
24
+ bash tests/smoke/test_cli.sh
25
+ ```
26
+
27
+ Primary input is a directory of per-species RBH files produced by `odp`
28
+ against the BCnS ALG database. From there, most analyses are a single
29
+ `egt <subcommand>` call or a Snakefile under `workflows/`.
30
+
31
+ ## Quick Start
32
+
33
+ ### PhyloTreeUMAP — manifold projection of per-species ALG state
34
+
35
+ ```sh
36
+ # 1. build per-sample distance matrices + sampledf
37
+ egt phylotreeumap build-distances \
38
+ --rbh-dir /path/to/rbh_files \
39
+ --alg-name BCnSSimakov2022 \
40
+ --sampledf-out GTUMAP/sampledf.tsv \
41
+ --distance-dir GTUMAP/distance_matrices/
42
+
43
+ # 2. index ALG locus pairs
44
+ egt phylotreeumap algcomboix \
45
+ --alg-rbh /path/to/LG_db/BCnSSimakov2022/BCnSSimakov2022.rbh \
46
+ --output GTUMAP/alg_combo_to_ix.tsv
47
+
48
+ # 3. run the UMAP + HTML plot (MGT / MLT / ODOG variants)
49
+ egt phylotreeumap mgt-mlt-umap --help
50
+ ```
51
+
52
+ ### ALG fusion analysis on a calibrated tree
53
+
54
+ ```sh
55
+ egt alg-fusions --help
56
+ ```
57
+
58
+ ### Perspective-chromosome tree mapping + Monte Carlo rates
59
+
60
+ ```sh
61
+ egt perspchrom-df-to-tree --help
62
+ ```
63
+
64
+ ### Rate analyses, Fourier periodicity, branch stats
65
+
66
+ ```sh
67
+ egt branch-stats-vs-time --help
68
+ egt fourier-of-rates --help
69
+ egt fourier-support-vs-time --help
70
+ egt collapsed-tree --help
71
+ egt tree-changes --help
72
+ egt decay-pairwise --help
73
+ egt decay-many-species --help
74
+ ```
75
+
76
+ ### Phylogeny preparation
77
+
78
+ ```sh
79
+ egt taxids-to-newick --help
80
+ egt newick-to-common-ancestors --help
81
+ ```
82
+
83
+ ## Users' Guide
84
+
85
+ `egt` is a collection of analysis scripts rather than a monolithic pipeline.
86
+ Each script is also registered as a subcommand of the `egt` console script:
87
+
88
+ ```sh
89
+ egt alg-fusions --help
90
+ # equivalent to
91
+ python -m egt.plot_alg_fusions --help
92
+ ```
93
+
94
+ ### Installation
95
+
96
+ ```sh
97
+ git clone https://github.com/conchoecia/egt.git
98
+ cd egt
99
+ python -m venv .venv && source .venv/bin/activate
100
+ pip install -e .
101
+ ```
102
+
103
+ ### Python requirements
104
+
105
+ Python 3.10 or newer. `pip install -e .` pulls the deps from `pyproject.toml`:
106
+
107
+ - numpy, pandas, scipy, scikit-learn, matplotlib, networkx, Pillow
108
+ - umap-learn[plot] — UMAP + the plotting extras needed by PhyloTreeUMAP
109
+ - bokeh — interactive HTML plots
110
+ - ete4 — taxonomy trees and NCBI taxid handling
111
+ - snakemake (>=7, <9)
112
+ - pyyaml
113
+
114
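+ Conda equivalent. Note that conda packages do not support pip-style extras, so any `umap-learn[plot]` plotting dependencies beyond the packages listed here are not installed by this command and must be added separately: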
115
+
116
+ ```sh
117
+ mamba install -c conda-forge -c bioconda \
118
+ python=3.11 numpy pandas scipy scikit-learn matplotlib networkx pillow \
119
+ "umap-learn" bokeh ete4 "snakemake<9" pyyaml
120
+ pip install --no-deps -e .
121
+ ```
122
+
123
+ ### Upstream tools
124
+
125
+ `egt` consumes outputs of several companion tools:
126
+
127
+ - [`odp`](https://github.com/conchoecia/odp) — per-species RBH files, ALG
128
+ databases (BCnSSimakov2022 etc.)
129
+ - [`chrombase`](https://github.com/conchoecia/chrombase) — chromosome-scale
130
+ NCBI genome database builder
131
+ - [`genbargo`](https://github.com/conchoecia/genbargo) — embargo-aware
132
+ assembly curation
133
+ - [`chromsim`](https://github.com/conchoecia/chromsim) — chromosome-evolution
134
+ simulations
135
+
136
+ ### CLI overview
137
+
138
+ ```
139
+ phylotreeumap — UMAP-over-ALG-topology (MGT, MLT, ODOG subcommands)
140
+ phylotreeumap-subsample — subsample species phylogenetically with per-clade caps
141
+ alg-fusions — plot fusion events on a phylogeny (canonical v3)
142
+ alg-dispersion — plot ALG dispersion across species
143
+ perspchrom-df-to-tree — map perspective-chromosome changes onto a tree (Monte Carlo)
144
+ decay-pairwise — pairwise ALG-decay analysis
145
+ decay-many-species — cross-species ALG conservation / decay
146
+ chrom-number-vs-changes — chromosome count vs rearrangement-rate scatter
147
+ branch-stats-vs-time — branch statistics against geologic time
148
+ branch-stats-tree — branch statistics laid out on a tree
149
+ branch-stats-tree-pair — paired branch-stats tree plots
150
+ collapsed-tree — collapsed-tree visualization
151
+ tree-changes — per-branch changes on a tree
152
+ fourier-of-rates — Fourier analysis of chromosomal change rates
153
+ fourier-support-vs-time — Fourier-support-vs-time plots
154
+ count-unique-changes — count unique changes per branch
155
+ defining-features — identify clade-defining features
156
+ defining-features-plot — plot defining features
157
+ defining-features-plotRBH — plot defining features on RBH dataframes
158
+ taxids-to-newick — build a Newick tree from NCBI taxids
159
+ newick-to-common-ancestors — divergence-time annotation from a timetree
160
+ algs-split-across-scaffolds — find ALGs split across scaffolds
161
+ get-assembly-sizes — summarize assembly sizes
162
+ pull-entries-from-yaml — select rows from a YAML sample list
163
+ aggregate-filechecker — aggregate filechecker benchmarks
164
+ aggregate-filesizes — aggregate file-size summaries
165
+ join-supplementary-tables — join table fragments
166
+ phylotreeumap-plotdfs — PhyloTreeUMAP plotting dataframe helper
167
+ ```
168
+
169
+ ### Snakemake workflows
170
+
171
+ Multi-stage Snakemake definitions live under `workflows/`:
172
+
173
+ ```
174
+ workflows/
175
+ ├── phylotree_umap.smk
176
+ ├── phylotree_umap_subsampling.smk
177
+ ├── perspchrom_df_stats_and_mc.smk
178
+ ├── annotate_sample_df.smk
179
+ ├── sample_to_num_chromosomes.smk
180
+ ├── odol_annotate_blast.smk
181
+ └── pipeline/
182
+ ├── README.md
183
+ ├── config.template.yaml
184
+ └── run.sh
185
+ ```
186
+
187
+ Each workflow is standalone and parameterized via a YAML config.
188
+
189
+ ### Input file formats
190
+
191
+ - **RBH files** (`.rbh`) — tab-separated reciprocal-best-hits output of `odp`.
192
+ Filenames must embed the NCBI taxid as the second hyphen-separated field,
193
+ e.g. `speciesname-7777-something.rbh`.
194
+ - **Sample dataframe** (`sampledf.tsv`) — output of
195
+ `egt phylotreeumap build-distances`; consumed by most downstream commands.
196
+ - **ALG database RBH** — e.g. `BCnSSimakov2022.rbh`, from `odp`'s LG_db.
197
+ - **Newick trees** — ete4-readable. `egt taxids-to-newick` emits these.
198
+ - **Divergence-time tables** — TSV, as accepted by
199
+ `egt newick-to-common-ancestors`.
200
+
201
+ ## Layout
202
+
203
+ ```
204
+ egt/
205
+ ├── src/egt/ — Python package
206
+ │ ├── cli.py — argparse dispatcher
207
+ │ ├── _vendor/ — vendored, frozen plotting utilities
208
+ │ ├── legacy/ — prior versions of plot_ALG_fusions kept for parity
209
+ │ └── *.py — one module per subcommand
210
+ ├── workflows/ — Snakemake workflows
211
+ ├── configs/ — example configs
212
+ ├── data/ — small bundled data
213
+ ├── tests/
214
+ │ ├── testdb/ — mini_hydra + mini_urchin fixtures
215
+ │ └── smoke/test_cli.sh — CLI smoke test
216
+ └── docs/
217
+ ```
218
+
219
+ ## Related tools
220
+
221
+ - [`odp`](https://github.com/conchoecia/odp)
222
+ - [`chrombase`](https://github.com/conchoecia/chrombase)
223
+ - [`genbargo`](https://github.com/conchoecia/genbargo)
224
+ - [`chromsim`](https://github.com/conchoecia/chromsim)
225
+
226
+ ## Citing egt
227
+
228
+ If you use this toolkit, please cite:
229
+
230
+ > Schultz, D.T., Blümel, A., Destanović, D., Sarigol, F., Simakov, O. (2024).
231
+ > *Topological mixing and irreversibility in animal chromosome evolution.*
232
+ > bioRxiv. [doi:10.1101/2024.07.29.605683](https://doi.org/10.1101/2024.07.29.605683)
233
+
234
+ For background on the topological framework for comparative genomics, see:
235
+
236
+ > Schultz, D.T., Simakov, O. (2026).
237
+ > *Topological Approaches in Animal Comparative Genomics.*
238
+ > Annual Review of Animal Biosciences 14(1), 17–48.
239
+ > [doi:10.1146/annurev-animal-030424-084541](https://doi.org/10.1146/annurev-animal-030424-084541)
240
+
241
+ See also [`CITATION.cff`](CITATION.cff).
242
+
243
+ ## License
244
+
245
+ MIT — see [`LICENSE`](LICENSE).
egt-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,66 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "egt"
7
+ version = "0.1.0"
8
+ description = "Evolutionary Genome Topology — analysis toolkit for chromosome evolution across metazoan genomes using reciprocal-best-hits data."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Darrin T. Schultz", email = "darrin.schultz@univie.ac.at" },
14
+ ]
15
+ keywords = [
16
+ "comparative genomics",
17
+ "chromosome evolution",
18
+ "ancestral linkage groups",
19
+ "ALG",
20
+ "synteny",
21
+ "UMAP",
22
+ "phylogenetics",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 4 - Beta",
26
+ "Intended Audience :: Science/Research",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Operating System :: POSIX",
29
+ "Programming Language :: Python :: 3",
30
+ "Programming Language :: Python :: 3.10",
31
+ "Programming Language :: Python :: 3.11",
32
+ "Programming Language :: Python :: 3.12",
33
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
34
+ ]
35
+ dependencies = [
36
+ "numpy",
37
+ "pandas",
38
+ "scipy",
39
+ "scikit-learn",
40
+ "matplotlib",
41
+ "networkx",
42
+ "Pillow",
43
+ "umap-learn[plot]",
44
+ "bokeh",
45
+ "ete4",
46
+ "snakemake>=7,<9",
47
+ "pyyaml",
48
+ ]
49
+
50
+ [project.optional-dependencies]
51
+ dev = ["pytest", "pyflakes"]
52
+
53
+ [project.urls]
54
+ Homepage = "https://github.com/conchoecia/egt"
55
+ "Source code" = "https://github.com/conchoecia/egt"
56
+ "Manuscript" = "https://github.com/conchoecia/egt/blob/main/docs/manuscript.md"
57
+ "odp" = "https://github.com/conchoecia/odp"
58
+
59
+ [project.scripts]
60
+ egt = "egt.cli:main"
61
+
62
+ [tool.setuptools.packages.find]
63
+ where = ["src"]
64
+
65
+ [tool.setuptools.package-data]
66
+ egt = ["py.typed"]
egt-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
egt-0.1.0/src/egt/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
egt-0.1.0/src/egt/_vendor/__init__.py ADDED
File without changes