egt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of egt might be problematic. Click here for more details.

Files changed (50) hide show
  1. egt/__init__.py +1 -0
  2. egt/_vendor/__init__.py +0 -0
  3. egt/_vendor/odp_plotting_functions.py +81 -0
  4. egt/aggregate_filechecker_benchmarks.py +93 -0
  5. egt/aggregate_filesizes.py +50 -0
  6. egt/algs_split_across_scaffolds.py +204 -0
  7. egt/annotate_sample_df.py +628 -0
  8. egt/bokeh_helper.py +33 -0
  9. egt/cli.py +76 -0
  10. egt/count_unique_changes_per_branch.py +117 -0
  11. egt/defining_features.py +282 -0
  12. egt/defining_features_plot.py +204 -0
  13. egt/defining_features_plotRBH.py +700 -0
  14. egt/fourier_of_rates.py +840 -0
  15. egt/fourier_spectral_background.py +52 -0
  16. egt/get_assembly_sizes.py +49 -0
  17. egt/join_supplementary_tables.py +90 -0
  18. egt/legacy/__init__.py +0 -0
  19. egt/legacy/defining_features_plot2.py +406 -0
  20. egt/legacy/plot_alg_fusions_v1.py +1382 -0
  21. egt/legacy/plot_alg_fusions_v2.py +1349 -0
  22. egt/newick_to_common_ancestors.py +3271 -0
  23. egt/odol_annotate_blast.py +397 -0
  24. egt/perspchrom_df_to_tree.py +2683 -0
  25. egt/phylotreeumap.py +4551 -0
  26. egt/phylotreeumap_plotdfs.py +2023 -0
  27. egt/phylotreeumap_subsample.py +693 -0
  28. egt/phylotreeumap_testpixels.py +139 -0
  29. egt/plot_alg_dispersion.py +466 -0
  30. egt/plot_alg_fusions.py +3208 -0
  31. egt/plot_branch_stats_advanced.py +305 -0
  32. egt/plot_branch_stats_tree.py +346 -0
  33. egt/plot_branch_stats_tree_pair.py +238 -0
  34. egt/plot_branch_stats_vs_time.py +2905 -0
  35. egt/plot_chrom_number_vs_changes.py +689 -0
  36. egt/plot_collapsed_tree.py +319 -0
  37. egt/plot_decay_many_species.py +437 -0
  38. egt/plot_decay_pairwise_steps.py +1466 -0
  39. egt/plot_fourier_support_vs_time.py +134 -0
  40. egt/plot_tree_changes.py +111 -0
  41. egt/pull_entries_from_yaml.py +108 -0
  42. egt/rbh_tools.py +441 -0
  43. egt/taxid_tools.py +75 -0
  44. egt/taxids_to_newick.py +782 -0
  45. egt-0.1.0.dist-info/METADATA +285 -0
  46. egt-0.1.0.dist-info/RECORD +50 -0
  47. egt-0.1.0.dist-info/WHEEL +5 -0
  48. egt-0.1.0.dist-info/entry_points.txt +2 -0
  49. egt-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. egt-0.1.0.dist-info/top_level.txt +1 -0
egt/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,81 @@
1
+ """
2
+ These are the plotting functions used by many ODP programs
3
+ """
4
+
5
+ import matplotlib
6
+
7
def format_matplotlib():
    """Set the matplotlib font, grid, and image/PDF/PS rc options shared by the plots."""
    # DejaVu Sans is used instead of Helvetica because many users do not have
    # Helvetica installed: https://github.com/conchoecia/odp/issues/34
    font_settings = {
        'family': 'sans-serif',
        'sans-serif': 'DejaVu Sans',
        'weight': 'normal',
        'size': 12,
    }
    matplotlib.rc('font', **font_settings)

    # grid style
    matplotlib.rc('grid', color=".95", linestyle="-")

    matplotlib.rcParams.update({
        # Preserve the vertical order of embedded images:
        'image.composite_image': False,
        # text as font in pdf (and ps) output
        'pdf.fonttype': 42,
        'ps.fonttype': 42,
    })
25
+
26
def plot_decay(datastruct, outpath, outtsv):
    """
    Plot, for every ALG, how many of its genes are conserved versus scattered,
    and write the per-ALG counts to a TSV file.

    Parameters:
      - datastruct - dict keyed by ALG name. Each value is indexable with
                     [0] and [1], both dicts of scaffold name -> gene count:
                       [0] counts of genes that are on orthologous chroms
                       [1] counts of genes that are not on orthologous chroms
      - outpath    - the path to save the plot to
      - outtsv     - the path to save the processed data (TSV) to

    The TSV has the columns ALG, conserved, scattered, total and is sorted by
    total in descending order. Nothing is returned.
    """
    import matplotlib.pyplot as plt
    # pandas is imported here because this module only imports matplotlib at
    # file level; without this the function fails with NameError on `pd`.
    import pandas as pd

    # convert the datastruct to a simple dataframe
    ALGs = list(datastruct)
    conserved = [sum(datastruct[alg][0].values()) for alg in ALGs]
    scattered = [sum(datastruct[alg][1].values()) for alg in ALGs]
    total = [c + s for c, s in zip(conserved, scattered)]
    df = pd.DataFrame({"ALG": ALGs, "conserved": conserved,
                       "scattered": scattered, "total": total})
    df = df.sort_values(by="total", ascending=False).reset_index(drop=True)
    df.to_csv(outtsv, sep="\t", index=False)

    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('ALG size')
    ax1.set_ylabel('Distribution size', color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    # one vertical stem per ALG, from 0 up to the ALG's total size
    for size in df["total"]:
        ax1.plot([size, size], [0, size], 'k-')
    ax1.plot(df["total"], df["total"], 'ro')
    ax1.plot(df["total"], df["scattered"], 'bo')

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylim([0, 100])

    color = 'tab:blue'
    # the x-label was already handled with ax1
    ax2.set_ylabel('percent conserved on ALGs', color=color)
    ax2.plot(df["total"], 100 * (df["conserved"] / df["total"]), "b-")
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    fig.savefig(outpath)
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env python3
2
+ """Aggregate benchmark files from odp_filechecker and append input file sizes.
3
+
4
+ This script reads Snakemake benchmark files produced by the rules in
5
+ `scripts/odp_filechecker`. Each benchmark file name is expected to contain the
6
+ sample (genome assembly accession) and the rule name, e.g.:
7
+ ```
8
+ benchmarks/check_genome_legality/GCF_00000000.1.check_genome_legality.benchmark.txt
9
+ ```
10
+
11
+ For every benchmark entry the script looks up the input files for the
12
+ corresponding sample in the ODP configuration file and records their sizes in
13
+ bytes. The benchmark metrics together with the computed file sizes are written
14
+ to a single TSV file.
15
+ """
16
+ import argparse
17
+ import glob
18
+ import os
19
+ import re
20
+ from typing import Dict
21
+
22
+ import pandas as pd
23
+ import yaml
24
+
25
+
26
+ def _input_paths(rule: str, sample: str, config: Dict) -> Dict[str, str]:
27
+ """Return a mapping of column name to input file path for a rule/sample."""
28
+ sp_conf = config["species"][sample]
29
+ if rule == "check_genome_legality":
30
+ return {"genome_bytes": sp_conf["genome"]}
31
+ if rule == "check_protein_legality":
32
+ return {"proteins_bytes": sp_conf["proteins"]}
33
+ if rule == "check_chrom_legality":
34
+ return {
35
+ "genome_bytes": sp_conf["genome"],
36
+ "proteins_bytes": sp_conf["proteins"],
37
+ "chrom_bytes": sp_conf["chrom"],
38
+ }
39
+ return {}
40
+
41
+
42
def aggregate(config_path: str, benchmarks_dir: str) -> pd.DataFrame:
    """Read benchmark files and append input file sizes.

    Parameters:
      - config_path    - path to the YAML config; input paths are looked up
                         through `_input_paths(rule, sample, config)`.
      - benchmarks_dir - directory searched recursively for files named
                         ``<sample>.<rule>.benchmark.txt``.

    Returns a DataFrame with one row per usable benchmark file. Each row holds
    the first data row of the benchmark TSV plus ``sample``, ``rule`` and one
    ``*_bytes`` column per input file (``pd.NA`` when the file does not exist).
    Files are processed in sorted path order so the output is reproducible.
    """
    with open(config_path) as fh:
        # An empty YAML document loads as None; treat it as "no species".
        config = yaml.safe_load(fh) or {}

    pattern = re.compile(r"(?P<sample>.+)\.(?P<rule>[^.]+)\.benchmark\.txt$")
    glob_expr = os.path.join(benchmarks_dir, "**", "*.benchmark.txt")

    records = []
    # glob returns files in arbitrary order; sort for a deterministic table.
    for bfile in sorted(glob.glob(glob_expr, recursive=True)):
        m = pattern.search(os.path.basename(bfile))
        if not m:
            continue
        sample = m.group("sample")
        rule = m.group("rule")

        try:
            bench_df = pd.read_csv(bfile, sep="\t")
        except pd.errors.EmptyDataError:
            # zero-byte benchmark file: same outcome as a header-only file
            continue
        if bench_df.empty:
            continue
        row = bench_df.iloc[0].to_dict()
        row.update({"sample": sample, "rule": rule})

        try:
            inputs = _input_paths(rule, sample, config)
        except (KeyError, TypeError):
            # Sample or key absent from the config (KeyError) or a null
            # section such as `species:` (TypeError): keep the benchmark
            # metrics, just without size columns.
            inputs = {}
        for col, path in inputs.items():
            # A null/non-path value in the config cannot be stat'ed.
            is_path = isinstance(path, (str, os.PathLike))
            row[col] = os.path.getsize(path) if is_path and os.path.exists(path) else pd.NA
        records.append(row)
    return pd.DataFrame(records)
72
+
73
+
74
def main(argv=None):
    """Command-line entry point: aggregate the benchmarks and write them as TSV.

    `argv` is handed to argparse (None means the process arguments).
    Returns 0 after the table has been written.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--config", required=True, help="Path to ODP config YAML")
    cli.add_argument("--benchmarks", required=True, help="Directory with benchmark files")
    cli.add_argument(
        "--out",
        default="aggregated_benchmarks_with_sizes.tsv",
        help="Output TSV file",
    )
    opts = cli.parse_args(argv)

    table = aggregate(opts.config, opts.benchmarks)
    table.to_csv(opts.out, sep="\t", index=False)
    return 0
90
+
91
+
92
+ if __name__ == "__main__":
93
+ raise SystemExit(main())
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import csv
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import yaml # pip install pyyaml
8
+
9
def bytes_to_mb(nbytes: int, base: int = 1024) -> float:
    """Convert a byte count to megabytes, where one megabyte is ``base * base`` bytes."""
    bytes_per_mb = base * base
    return nbytes / bytes_per_mb
11
+
12
+ def size_mb_or_na(path: str | None, base: int = 1024) -> str:
13
+ if not path:
14
+ return "NA"
15
+ p = Path(path)
16
+ try:
17
+ mb = bytes_to_mb(p.stat().st_size, base)
18
+ return f"{mb:.3f}"
19
+ except FileNotFoundError:
20
+ print(f"WARNING: file not found -> {p}", file=sys.stderr)
21
+ return "NA"
22
+
23
def main(argv=None):
    """Write a TSV with the sizes (in MB) of each species' input files.

    Reads the ``species`` mapping from the YAML file given with ``--config``
    and writes one row per sample with the columns
    sample, assembly_accession, proteins_MB, chrom_MB, genome_MB.
    Missing paths or files are reported as "NA" (see `size_mb_or_na`).

    `argv` is handed to argparse (None means the process arguments).
    Returns 0 on success; argparse exits with status 2 on bad arguments.
    """
    ap = argparse.ArgumentParser(description="Summarize input file sizes from config.yaml")
    # required: without a config path there is nothing to read, and
    # Path(None) would otherwise fail with an unhelpful TypeError.
    ap.add_argument("-c", "--config", required=True, help="Path to config.yaml")
    ap.add_argument("-o", "--out", help="Output TSV path",
                    type=str, default="input_filesizes.tsv")
    ap.add_argument("--base", type=int, choices=(1000, 1024), default=1024,
                    help="MB base (1000 for decimal MB, 1024 for MiB; default 1024)")
    args = ap.parse_args(argv)

    # An empty YAML file loads as None and `species:` with no value is None
    # too; both are treated as "no species" instead of crashing.
    cfg = yaml.safe_load(Path(args.config).read_text()) or {}
    species = cfg.get("species") or {}

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", newline="") as fh:
        w = csv.writer(fh, delimiter="\t")
        w.writerow(["sample", "assembly_accession", "proteins_MB", "chrom_MB", "genome_MB"])
        for sample, info in species.items():
            info = info or {}  # a sample declared with no fields
            acc = info.get("assembly_accession", "")
            sizes = [size_mb_or_na(info.get(key), args.base)
                     for key in ("proteins", "chrom", "genome")]
            w.writerow([sample, acc, *sizes])
    return 0
48
+
49
+ if __name__ == "__main__":
50
+ raise SystemExit(main())
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ Program : ALGs_split_across_two_or_more_scaffolds.py
5
+ Language : python 3
6
+ Date : 2024-02-05
7
+ Author : Darrin T. Schultz
8
+ Email : darrin.schultz@univie.ac.at
9
+ Github : https://github.com/conchoecia/odp
10
+ Support : For issues or questions, please search if the topic has been discussed already
11
+ on github and open a new issue if not: https://github.com/conchoecia/odp/issues
12
+ License : GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007. See the LICENSE file.
13
+ Citation : If you use this software for your scientific publication, please cite:
14
+ Schultz, DT; Haddock, SHD; Bredeson, JV; Green, RE; Simakov, O & Rokhsar, DS
15
+ Ancient gene linkages support ctenophores as sister to other animals. Nature (2023).
16
+ https://doi.org/10.1038/s41586-023-05936-6
17
+
18
+ Description:
19
+ - This program takes in a directory of .rbh files and makes a datastructure of all of the significantly-occurring gene groups on different chromosomes.
20
+
21
+ Usage instructions:
22
+ - See https://github.com/conchoecia/odp#getting-started
23
+ """
24
+
25
+ # odp stuff to format the plot
26
+ import os
27
+ import sys
28
+ # ODP-specific imports
29
+ thisfile_path = os.path.dirname(os.path.realpath(__file__))
30
+ scripts_path = os.path.join(thisfile_path, "../scripts")
31
+ sys.path.insert(1, scripts_path)
32
+ source_path = os.path.join(thisfile_path, "../source")
33
+ sys.path.insert(1, source_path)
34
+ from egt._vendor import odp_plotting_functions as odp_plot
35
+ from egt import rbh_tools
36
+ import argparse
37
+ import pandas as pd
38
+
39
+ # matplotlib stuff
40
+ import matplotlib.patches as mpatches
41
+ import matplotlib.pyplot as plt
42
+
43
def parse_args(argv=None):
    """
    Parse the command-line arguments and return the argparse namespace.

    The things we need to know are:
      - -d --rbh_directory - the directory of the rbh files that we will look at
      - -m --minsig        - the minimum significance value for the whole_FET
                             column in the rbh files, used to filter the rbh
                             files (float, default 0.005)
      - -a --alg           - the name of the ALG set that we are looking at.
                             This should be something that is the header of the
                             rbh files, for example "BCnSSimakov2022"

    If --rbh_directory is missing or is not an existing directory, argparse
    prints the usage plus an error message and exits with status 2.
    """
    parser = argparse.ArgumentParser(description="This program takes in a directory of .rbh files and makes a datastructure of all of the significantly-occurring gene groups on different chromosomes.")
    parser.add_argument("-d", "--rbh_directory", help="The directory of the rbh files that we will look at")
    parser.add_argument("-m", "--minsig", type=float, default=0.005, help="The minimum significance value for the whole_FET column in the rbh files. This is used to filter the rbh files.")
    parser.add_argument("-a", "--alg", help="The name of the ALG set that we are looking at. This should be something that is the header of the rbh files, for example 'BCnSSimakov2022'")
    args = parser.parse_args(argv)

    # Fail here with a clear message: the caller lists this directory right
    # away, so continuing with a bad value only postpones the crash.
    if args.rbh_directory is None:
        parser.error("the -d/--rbh_directory argument is required.")
    if not os.path.isdir(args.rbh_directory):
        parser.error(f"The directory you provided does not exist: {args.rbh_directory}")
    return args
60
+
61
def plot_chrom_number_vs_number_ALGs_split(ax, splitsdf, min_splits, inferredchromsize):
    """
    Scatter, for every sample, its chromosome number (x) against the number of
    ALGs that appear in at least min_splits rows of that sample (y).

    Parameters:
      - ax                - the axis to draw on
      - splitsdf          - dataframe with "sample" and "gene_group" columns
      - min_splits        - minimum number of rows a gene_group needs within a
                            sample to be counted
      - inferredchromsize - dict of sample name -> number of chromosomes

    Prints the per-sample summary table and returns the axis.
    """
    rows = []
    for _, sample_rows in splitsdf.groupby("sample"):
        sample_id = sample_rows["sample"].unique()[0]
        gene_groups = sample_rows["gene_group"]
        # how many ALGs occur in this sample at least min_splits times
        n_split = sum(1 for count in gene_groups.value_counts() if count >= min_splits)
        rows.append({
            "sample": sample_id,
            "num_chroms": inferredchromsize[sample_id],
            # the number of ALGs present in this sample at all
            "num_ALGs": len(gene_groups.unique()),
            "num_ALGs_min_splits": n_split,
        })
    summary = pd.DataFrame(rows)
    print(summary)

    # for now a simple scatter plot
    ax.scatter(summary["num_chroms"], summary["num_ALGs_min_splits"], alpha=0.1, lw=0)
    ax.set_xlabel("Number of chromosomes")
    ax.set_ylabel(f"Number of ALGs split across {min_splits} or more scaffolds")
    return ax
89
+
90
def plot_chrom_number_vs_number_ALGs_perchrom(ax, splitsdf, inferredchromsize):
    """
    For every sample, scatter its chromosome number (x) against the mean
    number of rows per scaffold in that sample (y).

    Parameters:
      - ax                - the axis to draw on
      - splitsdf          - dataframe with "sample" and "scaffold" columns
      - inferredchromsize - dict of sample name -> number of chromosomes

    Prints the per-sample summary table and returns the axis.
    """
    entries = []
    # Group on the scalar key "sample": grouping on the list ["sample"] makes
    # pandas >= 2.0 hand back 1-tuples as group names, which would end up in
    # the "sample" column.
    for samplename, group in splitsdf.groupby("sample"):
        # number of rows of this sample on each scaffold
        scafcounts = group["scaffold"].value_counts()
        if scafcounts.empty:
            # no usable scaffold values for this sample -> nothing to plot
            continue
        entries.append({"sample": samplename,
                        "num_chroms": inferredchromsize[samplename],
                        "mean_ALGs_on_chrom": scafcounts.mean()})
    # Explicit columns keep the lookups below valid when there are no entries.
    df = pd.DataFrame(entries, columns=["sample", "num_chroms", "mean_ALGs_on_chrom"])
    print(df)

    # for now make a simple scatter plot
    ax.scatter(df["num_chroms"], df["mean_ALGs_on_chrom"], alpha=0.1, lw=0)
    ax.set_xlabel("Number of chromosomes")
    ax.set_ylabel("Mean number of ALGs on each chromosome")
    return ax
131
+
132
def main(argv=None):
    """
    Analyze every .rbh file in the requested directory and save a two-panel
    figure to "ALGs_split_across_two_or_more_scaffolds.pdf" in the current
    working directory.

    For each file the rbh table is parsed with rbh_tools.parse_rbh, then
    rbh_tools.rbhdf_to_alglocdf and rbh_tools.rbh_to_scafnum supply the
    per-sample table and the per-sample chromosome number used by the two
    plotting functions.
    NOTE(review): the exact contents returned by the rbh_tools helpers are not
    visible from this file - confirm against rbh_tools.

    `argv` is handed to parse_args (None means the process arguments).
    Returns 0 on success and 1 when the directory contains no .rbh files.
    """
    args = parse_args(argv)

    # sorted so that runs are reproducible regardless of directory order
    rbh_files = sorted(os.path.join(args.rbh_directory, f)
                       for f in os.listdir(args.rbh_directory)
                       if f.endswith(".rbh"))
    if not rbh_files:
        # pd.concat raises ValueError on an empty list; stop with a clear message
        print(f"No .rbh files were found in {args.rbh_directory}", file=sys.stderr)
        return 1

    # we need something of sample to chromnum
    sample_to_chromnum = {}
    # now we go through the files
    entries = []
    n_files = len(rbh_files)
    for i, rbhfile in enumerate(rbh_files, start=1):
        # a counter that goes back to the beginning of the line
        print(f"\r analyzing {i}/{n_files}", end="")
        rbhdf = rbh_tools.parse_rbh(rbhfile)
        splitdf, samplename = rbh_tools.rbhdf_to_alglocdf(rbhdf, args.minsig, args.alg)
        sample_to_chromnum[samplename] = rbh_tools.rbh_to_scafnum(rbhdf, samplename)
        entries.append(splitdf)
    print()
    print(f" Done analyzing {n_files}/{n_files}")

    # make a dataframe of all of the entries
    splitsdf = pd.concat(entries)

    # CALL THIS TO GET THE VISUAL STYLE WE NEED
    odp_plot.format_matplotlib()

    # figure size, and panel geometry expressed in the same units;
    # dividing by fw/fh converts to the figure fractions add_axes expects
    fw = 10
    fh = 20
    left1 = 0.6
    bottom1 = 0.6
    bottom2 = 7
    paneldim = 5
    fig = plt.figure(figsize=(fw, fh))

    # Lower panel: number of chromosomes vs the number of ALGs split across
    # two or more scaffolds
    lower = fig.add_axes([left1 / fw,      # left offset
                          bottom1 / fh,    # bottom offset
                          paneldim / fw,   # width
                          paneldim / fh])  # height
    plot_chrom_number_vs_number_ALGs_split(lower, splitsdf, 2, sample_to_chromnum)

    # Upper panel: number of chromosomes vs the number of ALGs on each chromosome
    upper = fig.add_axes([left1 / fw,      # left offset
                          bottom2 / fh,    # bottom offset
                          paneldim / fw,   # width
                          paneldim / fh])  # height
    plot_chrom_number_vs_number_ALGs_perchrom(upper, splitsdf, sample_to_chromnum)

    outfilename = "ALGs_split_across_two_or_more_scaffolds.pdf"
    fig.savefig(outfilename, bbox_inches="tight")
    return 0
202
+
203
+ if __name__ == "__main__":
204
+ raise SystemExit(main())