PyPI - egt - Versions diffs - 0.1.0__py3-none-any.whl - Mend

egt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of egt might be problematic. Click here for more details.

Files changed (50) hide show

egt/__init__.py +1 -0
egt/_vendor/__init__.py +0 -0
egt/_vendor/odp_plotting_functions.py +81 -0
egt/aggregate_filechecker_benchmarks.py +93 -0
egt/aggregate_filesizes.py +50 -0
egt/algs_split_across_scaffolds.py +204 -0
egt/annotate_sample_df.py +628 -0
egt/bokeh_helper.py +33 -0
egt/cli.py +76 -0
egt/count_unique_changes_per_branch.py +117 -0
egt/defining_features.py +282 -0
egt/defining_features_plot.py +204 -0
egt/defining_features_plotRBH.py +700 -0
egt/fourier_of_rates.py +840 -0
egt/fourier_spectral_background.py +52 -0
egt/get_assembly_sizes.py +49 -0
egt/join_supplementary_tables.py +90 -0
egt/legacy/__init__.py +0 -0
egt/legacy/defining_features_plot2.py +406 -0
egt/legacy/plot_alg_fusions_v1.py +1382 -0
egt/legacy/plot_alg_fusions_v2.py +1349 -0
egt/newick_to_common_ancestors.py +3271 -0
egt/odol_annotate_blast.py +397 -0
egt/perspchrom_df_to_tree.py +2683 -0
egt/phylotreeumap.py +4551 -0
egt/phylotreeumap_plotdfs.py +2023 -0
egt/phylotreeumap_subsample.py +693 -0
egt/phylotreeumap_testpixels.py +139 -0
egt/plot_alg_dispersion.py +466 -0
egt/plot_alg_fusions.py +3208 -0
egt/plot_branch_stats_advanced.py +305 -0
egt/plot_branch_stats_tree.py +346 -0
egt/plot_branch_stats_tree_pair.py +238 -0
egt/plot_branch_stats_vs_time.py +2905 -0
egt/plot_chrom_number_vs_changes.py +689 -0
egt/plot_collapsed_tree.py +319 -0
egt/plot_decay_many_species.py +437 -0
egt/plot_decay_pairwise_steps.py +1466 -0
egt/plot_fourier_support_vs_time.py +134 -0
egt/plot_tree_changes.py +111 -0
egt/pull_entries_from_yaml.py +108 -0
egt/rbh_tools.py +441 -0
egt/taxid_tools.py +75 -0
egt/taxids_to_newick.py +782 -0
egt-0.1.0.dist-info/METADATA +285 -0
egt-0.1.0.dist-info/RECORD +50 -0
egt-0.1.0.dist-info/WHEEL +5 -0
egt-0.1.0.dist-info/entry_points.txt +2 -0
egt-0.1.0.dist-info/licenses/LICENSE +21 -0
egt-0.1.0.dist-info/top_level.txt +1 -0

egt/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

egt/_vendor/__init__.py ADDED Viewed

File without changes

egt/_vendor/odp_plotting_functions.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""
+These are the plotting functions used by many ODP programs
+"""
+import matplotlib
def format_matplotlib():
    """Set the global matplotlib rc options (font, grid, export) used for the plots."""
    # DejaVu Sans is used rather than Helvetica, which many users do not have
    # installed: https://github.com/conchoecia/odp/issues/34
    matplotlib.rc('font', **{'family': 'sans-serif',
                             'sans-serif': 'DejaVu Sans',
                             'weight': 'normal',
                             'size': 12})
    # grid style
    matplotlib.rc('grid', color=".95", linestyle="-")
    export_options = (
        # Preserve the vertical order of embedded images
        ('image.composite_image', False),
        # text as font in pdf / ps output
        ('pdf.fonttype', 42),
        ('ps.fonttype', 42),
    )
    for option, value in export_options:
        matplotlib.rcParams[option] = value
def plot_decay(datastruct, outpath, outtsv):
    """
    Plot, per ALG, how many genes are conserved versus scattered, and save the table.

    Parameters:
      - 0th datastruct - the data structure described below with the data to plot
      - 1st outpath    - the path to save the plot to
      - 2nd outtsv     - the path to save the processed data to (tab-separated)
    The input is this datastructure:
      - key: the ALG name
      - value: an indexable pair of dicts
         - 0th element: counts of genes that are on orthologous chroms
         - 1st element: counts of genes that are not on orthologous chroms
         - Only the dict values (gene counts) are read here; the keys are
           described upstream as scaffold names.
    Side effects:
      - writes the columns ALG, conserved, scattered, total to outtsv, sorted
        by total in descending order
      - writes the figure to outpath
    """
    import matplotlib.pyplot as plt
    # Imported locally: the module header only imports matplotlib, so the bare
    # `pd` reference used here previously raised NameError.
    import pandas as pd
    # convert the datastruct to a simple dataframe
    ALGs = list(datastruct)
    conserved = [sum(datastruct[alg][0].values()) for alg in ALGs]
    scattered = [sum(datastruct[alg][1].values()) for alg in ALGs]
    total     = [c + s for c, s in zip(conserved, scattered)]
    df = pd.DataFrame({"ALG": ALGs, "conserved": conserved,
                       "scattered": scattered, "total": total})
    df = df.sort_values(by="total", ascending=False).reset_index(drop=True)
    df.to_csv(outtsv, sep="\t", index=False)
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('ALG size')
    ax1.set_ylabel('Distribution size', color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    # one vertical stem per ALG, from 0 up to the ALG's total gene count
    for alg_total in df["total"]:
        ax1.plot([alg_total, alg_total], [0, alg_total], 'k-')
    ax1.plot(df["total"], df["total"], 'ro')
    ax1.plot(df["total"], df["scattered"], 'bo')
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    ax2.set_ylim([0, 100])
    color = 'tab:blue'
    ax2.set_ylabel('percent conserved on ALGs', color=color)  # we already handled the x-label with ax1
    ax2.plot(df["total"], 100*(df["conserved"]/df["total"]), "b-")
    ax2.tick_params(axis='y', labelcolor=color)
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    # Save through the figure object and release it, so repeated calls do not
    # accumulate open figures in pyplot's global registry.
    fig.savefig(outpath)
    plt.close(fig)

egt/aggregate_filechecker_benchmarks.py ADDED Viewed

@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""Aggregate benchmark files from odp_filechecker and append input file sizes.
+This script reads Snakemake benchmark files produced by the rules in
+`scripts/odp_filechecker`.  Each benchmark file name is expected to contain the
+sample (genome assembly accession) and the rule name, e.g.:
+```
+benchmarks/check_genome_legality/GCF_00000000.1.check_genome_legality.benchmark.txt
+```
+For every benchmark entry the script looks up the input files for the
+corresponding sample in the ODP configuration file and records their sizes in
+bytes.  The benchmark metrics together with the computed file sizes are written
+to a single TSV file.
+"""
+import argparse
+import glob
+import os
+import re
+from typing import Dict
+import pandas as pd
+import yaml
+def _input_paths(rule: str, sample: str, config: Dict) -> Dict[str, str]:
+    """Return a mapping of column name to input file path for a rule/sample."""
+    sp_conf = config["species"][sample]
+    if rule == "check_genome_legality":
+        return {"genome_bytes": sp_conf["genome"]}
+    if rule == "check_protein_legality":
+        return {"proteins_bytes": sp_conf["proteins"]}
+    if rule == "check_chrom_legality":
+        return {
+            "genome_bytes": sp_conf["genome"],
+            "proteins_bytes": sp_conf["proteins"],
+            "chrom_bytes": sp_conf["chrom"],
+        }
+    return {}
def aggregate(config_path: str, benchmarks_dir: str) -> pd.DataFrame:
    """Read benchmark files and append input file sizes.

    Every ``*.benchmark.txt`` file found recursively under ``benchmarks_dir``
    whose basename looks like ``<sample>.<rule>.benchmark.txt`` contributes one
    record: the first row of the tab-separated benchmark table, plus ``sample``
    and ``rule`` columns, plus one ``*_bytes`` column per input file of that
    rule (``pd.NA`` when the size cannot be determined).

    Parameters:
        config_path: path to the ODP configuration YAML.
        benchmarks_dir: directory searched recursively for benchmark files.

    Returns:
        A DataFrame with one row per usable benchmark file, in sorted path order.
    """
    with open(config_path, encoding="utf-8") as fh:
        # An empty YAML document loads as None; treat it as an empty config.
        config = yaml.safe_load(fh) or {}
    records = []
    pattern = re.compile(r"(?P<sample>.+)\.(?P<rule>[^.]+)\.benchmark\.txt$")
    search = os.path.join(benchmarks_dir, "**", "*.benchmark.txt")
    # glob returns files in arbitrary order; sort so the output is reproducible.
    for bfile in sorted(glob.glob(search, recursive=True)):
        m = pattern.search(os.path.basename(bfile))
        if not m:
            continue
        sample = m.group("sample")
        rule = m.group("rule")
        try:
            bench_df = pd.read_csv(bfile, sep="\t")
        except pd.errors.EmptyDataError:
            # A zero-byte benchmark file is skipped like a header-only one.
            continue
        if bench_df.empty:
            continue
        row = bench_df.iloc[0].to_dict()
        row.update({"sample": sample, "rule": rule})
        try:
            inputs = _input_paths(rule, sample, config)
        except (KeyError, TypeError):
            # Sample or field missing from the config, or a null config section.
            inputs = {}
        for col, path in inputs.items():
            try:
                row[col] = os.path.getsize(path)
            except (OSError, TypeError):
                # Missing/unreadable file, or a null path in the config.
                row[col] = pd.NA
        records.append(row)
    return pd.DataFrame(records)
def main(argv=None):
    """Command-line entry point: aggregate the benchmarks and write them as a TSV.

    ``argv`` is handed to argparse (``None`` means use ``sys.argv``). Returns 0.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--config", required=True, help="Path to ODP config YAML")
    cli.add_argument("--benchmarks", required=True, help="Directory with benchmark files")
    cli.add_argument("--out", default="aggregated_benchmarks_with_sizes.tsv",
                     help="Output TSV file")
    options = cli.parse_args(argv)
    table = aggregate(options.config, options.benchmarks)
    table.to_csv(options.out, sep="\t", index=False)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

egt/aggregate_filesizes.py ADDED Viewed

@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+import argparse
+import csv
+import sys
+from pathlib import Path
+import yaml  # pip install pyyaml
def bytes_to_mb(nbytes: int, base: int = 1024) -> float:
    """Convert a byte count to megabytes, where one megabyte is ``base * base`` bytes."""
    bytes_per_mb = base * base
    return nbytes / bytes_per_mb
+def size_mb_or_na(path: str | None, base: int = 1024) -> str:
+    if not path:
+        return "NA"
+    p = Path(path)
+    try:
+        mb = bytes_to_mb(p.stat().st_size, base)
+        return f"{mb:.3f}"
+    except FileNotFoundError:
+        print(f"WARNING: file not found -> {p}", file=sys.stderr)
+        return "NA"
def main(argv=None):
    """Write a TSV listing the input file sizes of every species in a config.

    Reads the ``species`` mapping from the YAML given by ``--config`` and writes
    one row per sample (sample, assembly_accession, proteins_MB, chrom_MB,
    genome_MB) to ``--out``, creating the output's parent directory if needed.

    ``argv`` is handed to argparse (``None`` means use ``sys.argv``). Returns 0.
    """
    ap = argparse.ArgumentParser(description="Summarize input file sizes from config.yaml")
    # Required: with no value the config read below failed with an opaque TypeError.
    ap.add_argument("-c", "--config", required=True, help="Path to config.yaml")
    ap.add_argument("-o", "--out", help="Output TSV path",
                    type=str, default="input_filesizes.tsv")
    ap.add_argument("--base", type=int, choices=(1000, 1024), default=1024,
                    help="MB base (1000 for decimal MB, 1024 for MiB; default 1024)")
    args = ap.parse_args(argv)
    # An empty YAML document, or a `species:` key with no entries, loads as None.
    cfg = yaml.safe_load(Path(args.config).read_text(encoding="utf-8")) or {}
    species = cfg.get("species") or {}
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", newline="", encoding="utf-8") as fh:
        w = csv.writer(fh, delimiter="\t")
        w.writerow(["sample", "assembly_accession", "proteins_MB", "chrom_MB", "genome_MB"])
        for sample, info in species.items():
            info = info or {}  # a sample listed with no fields loads as None
            acc = info.get("assembly_accession", "")
            sizes = [size_mb_or_na(info.get(key), args.base)
                     for key in ("proteins", "chrom", "genome")]
            w.writerow([sample, acc, *sizes])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

egt/algs_split_across_scaffolds.py ADDED Viewed

@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+"""
+Program  : ALGs_split_across_two_or_more_scaffolds.py
+Language : python 3
+Date     : 2024-02-05
+Author   : Darrin T. Schultz
+Email    : darrin.schultz@univie.ac.at
+Github   : https://github.com/conchoecia/odp
+Support  : For issues or questions, please search if the topic has been discussed already
+           on github and open a new issue if not: https://github.com/conchoecia/odp/issues
+License  : GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007. See the LICENSE file.
+Citation : If you use this software for your scientific publication, please cite:
+           Schultz, DT; Haddock, SHD; Bredeson, JV; Green, RE; Simakov, O & Rokhsar, DS
+           Ancient gene linkages support ctenophores as sister to other animals. Nature (2023).
+           https://doi.org/10.1038/s41586-023-05936-6
+Description:
+  - This program takes in a directory of .rbh files and makes a datastructure of all of the significantly-occurring gene groups on different chromosomes.
+Usage instructions:
+  - See https://github.com/conchoecia/odp#getting-started
+"""
+# odp stuff to format the plot
+import os
+import sys
+# ODP-specific imports
+thisfile_path = os.path.dirname(os.path.realpath(__file__))
+scripts_path = os.path.join(thisfile_path, "../scripts")
+sys.path.insert(1, scripts_path)
+source_path = os.path.join(thisfile_path, "../source")
+sys.path.insert(1, source_path)
+from egt._vendor import odp_plotting_functions as odp_plot
+from egt import rbh_tools
+import argparse
+import pandas as pd
+# matplotlib stuff
+import matplotlib.patches as mpatches
+import matplotlib.pyplot as plt
def parse_args(argv=None):
    """
    Parse the command-line options.
    The things we need to know are:
      - -d --rbh_directory - the directory of the rbh files that we will look at (required)
      - -m --minsig        - the number of the minimum significance value for the whole_FET column in the rbh files. This is used to filter the rbh files.
      - -a --alg           - the name of the ALG set that we are looking at. This should be something that is the header of the rbh files, for example "BCnSSimakov2022"
    argv is handed to argparse (None means use sys.argv).
    Returns the parsed argparse namespace.
    Exits through parser.error (usage message on stderr, SystemExit) when
    --rbh_directory is omitted or is not an existing directory.
    """
    parser = argparse.ArgumentParser(description="This program takes in a directory of .rbh files and makes a datastructure of all of the significantly-occurring gene groups on different chromosomes.")
    parser.add_argument("-d", "--rbh_directory", required=True, help="The directory of the rbh files that we will look at")
    parser.add_argument("-m", "--minsig", type = float, default = 0.005, help="The minimum significance value for the whole_FET column in the rbh files. This is used to filter the rbh files.")
    parser.add_argument("-a", "--alg", help="The name of the ALG set that we are looking at. This should be something that is the header of the rbh files, for example 'BCnSSimakov2022'")
    args = parser.parse_args(argv)
    # Stop here instead of only printing a message: the caller lists this
    # directory immediately, so continuing would just fail later with a traceback.
    if not os.path.isdir(args.rbh_directory):
        parser.error(f"The directory you provided does not exist: {args.rbh_directory}")
    return args
def plot_chrom_number_vs_number_ALGs_split(ax, splitsdf, min_splits, inferredchromsize):
    """
    Scatter, per sample, the chromosome number (x) against the number of ALGs
    that occur at least min_splits times in that sample's rows (y).

    Parameters:
      - ax                - the axis to draw on
      - splitsdf          - dataframe with at least the columns "sample" and "gene_group"
                            (presumably one row per gene_group/scaffold pair - confirm
                            against rbh_tools.rbhdf_to_alglocdf)
      - min_splits        - minimum number of rows a gene_group needs within a sample
                            to be counted
      - inferredchromsize - dict of sample name -> chromosome number
    Nothing is saved here; the summary table is printed and the axis is returned.
    """
    columns = ["sample", "num_chroms", "num_ALGs", "num_ALGs_min_splits"]
    entries = []
    for samplename, group in splitsdf.groupby("sample"):
        # how many rows each gene_group has within this sample
        occurrences = group["gene_group"].value_counts()
        entries.append({"sample": samplename,
                        "num_chroms": inferredchromsize[samplename],
                        # number of ALGs that are significant at all
                        "num_ALGs": len(group["gene_group"].unique()),
                        # number of ALGs on at least min_splits scaffolds
                        "num_ALGs_min_splits": int((occurrences >= min_splits).sum())})
    # Naming the columns keeps the lookups below valid when there are no samples.
    df = pd.DataFrame(entries, columns=columns)
    print(df)
    # for now make a simple scatter plot
    ax.scatter(df["num_chroms"], df["num_ALGs_min_splits"], alpha = 0.1, lw = 0)
    ax.set_xlabel("Number of chromosomes")
    ax.set_ylabel(f"Number of ALGs split across {min_splits} or more scaffolds")
    return ax
def plot_chrom_number_vs_number_ALGs_perchrom(ax, splitsdf, inferredchromsize):
    """
    Scatter, per sample, the chromosome number (x) against the mean number of
    rows per scaffold in that sample (y), i.e. the mean number of ALGs per chromosome.

    Parameters:
      - ax                - the axis to draw on
      - splitsdf          - dataframe with at least the columns "sample" and "scaffold"
      - inferredchromsize - dict of sample name -> chromosome number
    The summary table is printed and the axis is returned.
    """
    columns = ["sample", "num_chroms", "mean_ALGs_on_chrom"]
    entries = []
    # Group on the plain column name so each key is the sample name itself;
    # grouping on a one-element list yields 1-tuples on current pandas.
    for samplename, group in splitsdf.groupby("sample"):
        # number of rows (ALGs) on each scaffold of this sample
        scafcounts = group["scaffold"].value_counts()
        if scafcounts.empty:
            # no scaffold recorded for this sample, so there is nothing to average
            continue
        entries.append({"sample": samplename,
                        "num_chroms": inferredchromsize[samplename],
                        "mean_ALGs_on_chrom": scafcounts.mean()})
    # Naming the columns keeps the lookups below valid when there are no samples.
    df = pd.DataFrame(entries, columns=columns)
    print(df)
    # for now make a simple scatter plot
    ax.scatter(df["num_chroms"], df["mean_ALGs_on_chrom"], alpha = 0.1, lw = 0)
    ax.set_xlabel("Number of chromosomes")
    ax.set_ylabel("Mean number of ALGs on each chromosome")
    return ax
def main(argv=None):
    """
    Analyze every .rbh file in --rbh_directory and save a two-panel summary PDF.

    For each file the rbh table is parsed, reduced to its ALG locations with
    rbh_tools.rbhdf_to_alglocdf, and the sample's scaffold number is recorded.
    The combined table is drawn as two scatter panels and written to
    "ALGs_split_across_two_or_more_scaffolds.pdf" in the working directory.

    argv is handed to parse_args (None means use sys.argv).
    Returns 0 on success, 1 when the directory contains no .rbh files.
    """
    args = parse_args(argv)
    # Sorted so files are processed in a reproducible order (os.listdir order is arbitrary).
    rbh_files = sorted(os.path.join(args.rbh_directory, f)
                       for f in os.listdir(args.rbh_directory) if f.endswith(".rbh"))
    if not rbh_files:
        # pd.concat refuses an empty list, so report the real problem instead
        print(f"No .rbh files found in {args.rbh_directory}", file=sys.stderr)
        return 1
    # we need something of sample to chromnum
    sample_to_chromnum = {}
    # now we go through the files
    entries = []
    total = len(rbh_files)
    for i, rbhfile in enumerate(rbh_files, start=1):
        # make a counter that goes back to the beginning of the line
        print(f"\r  analyzing {i}/{total}", end = "")
        rbhdf = rbh_tools.parse_rbh(rbhfile)
        splitdf, samplename = rbh_tools.rbhdf_to_alglocdf(rbhdf, args.minsig, args.alg)
        sample_to_chromnum[samplename] = rbh_tools.rbh_to_scafnum(rbhdf, samplename)
        entries.append(splitdf)
    # overwrite the progress counter and finish the line
    print(f"\r  Done analyzing {total}/{total}")
    # make a dataframe of all of the entries
    splitsdf = pd.concat(entries)
    # CALL THIS TO GET THE VISUAL STYLE WE NEED
    odp_plot.format_matplotlib()
    # figure size in inches; panel positions below are given in inches and
    # divided by the figure size to get the fractions add_axes expects
    fw = 10
    fh = 20
    fig = plt.figure(figsize=(fw, fh))
    left     = 0.6
    bottom1  = 0.6
    bottom2  = 7
    paneldim = 5
    # This panel is the number of chromosomes vs the number of ALGs split across >= 2 scaffolds
    ax_split = fig.add_axes([left     / fw,   # left offset
                             bottom1  / fh,   # bottom offset
                             paneldim / fw,   # width
                             paneldim / fh])  # height
    plot_chrom_number_vs_number_ALGs_split(ax_split, splitsdf, 2, sample_to_chromnum)
    # This panel is the number of chromosomes vs the mean number of ALGs on each chromosome
    ax_perchrom = fig.add_axes([left     / fw,   # left offset
                                bottom2  / fh,   # bottom offset
                                paneldim / fw,   # width
                                paneldim / fh])  # height
    plot_chrom_number_vs_number_ALGs_perchrom(ax_perchrom, splitsdf, sample_to_chromnum)
    outfilename = "ALGs_split_across_two_or_more_scaffolds.pdf"
    fig.savefig(outfilename, bbox_inches="tight")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())