PyPI - assemblytics - Versions diffs - 2.0.0__py3-none-any.whl - Mend

assemblytics 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

assemblytics/__init__.py +1 -0
assemblytics/cli.py +211 -0
assemblytics/dot_prep.py +430 -0
assemblytics/dotplot.py +116 -0
assemblytics/index.py +188 -0
assemblytics/nchart.py +95 -0
assemblytics/summary.py +147 -0
assemblytics/uniq_anchor.py +357 -0
assemblytics/variant_charts.py +204 -0
assemblytics/variants.py +389 -0
assemblytics-2.0.0.dist-info/METADATA +196 -0
assemblytics-2.0.0.dist-info/RECORD +16 -0
assemblytics-2.0.0.dist-info/WHEEL +5 -0
assemblytics-2.0.0.dist-info/entry_points.txt +2 -0
assemblytics-2.0.0.dist-info/licenses/LICENSE +22 -0
assemblytics-2.0.0.dist-info/top_level.txt +1 -0

assemblytics/uniq_anchor.py ADDED Viewed

@@ -0,0 +1,357 @@
+#!/usr/bin/env python3
+import argparse
+import gzip
+import os
+import time
+import numpy as np
+import operator
+def run(args):
+    filename = args.delta
+    unique_length = args.unique_length
+    output_dir = args.out
+    keep_small_uniques = args.keep_small_uniques
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    # if keep_small_uniques:
+    #     print("Keeping fully unique alignments even if they are below the unique anchor length of", unique_length, "bp")
+    # else:
+    #     print("Discarding all alignments below the unique anchor length of", unique_length, "bp")
+    #     print("Use --keep-small-uniques to keep all the fully unique alignments even below this length")
+    # if unique_length == 10000:
+    #     print("Use --unique-length X to set the unique anchor length requirement. Default is 10000, such that each alignment must have at least 10000 bp from the query that are not included in any other alignments.")
+    try:
+        f = gzip.open(filename, 'rt')
+        header1 = f.readline().strip()
+        # Detected gzipped delta file.
+    except:
+        f = open(filename, 'r')
+        header1 = f.readline().strip()
+        # Detected uncompressed delta file.
+    # Skip the second line
+    f.readline()
+    linecounter = 0
+    current_query_name = ""
+    current_header = ""
+    lines_by_query = {}
+    header_lines_by_query = {}
+    before = time.time()
+    last = before
+    existing_query_names = set()
+    for line in f:
+        if line[0]==">":
+            fields = line.strip().split()
+            current_query_name = fields[1]
+            current_header = line.strip()
+            if current_query_name not in existing_query_names:
+                lines_by_query[current_query_name] = []
+                header_lines_by_query[current_query_name] = []
+                existing_query_names.add(current_query_name)
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # sometimes start and end are the other way around, but for this they need to be in order
+                query_min = min([int(fields[2]),int(fields[3])])
+                query_max = max([int(fields[2]),int(fields[3])])
+                lines_by_query[current_query_name].append((query_min,query_max))
+                header_lines_by_query[current_query_name].append(current_header)
+    f.close()
+    before = time.time()
+    alignments_to_keep = {}
+    num_queries = len(lines_by_query)
+    num_query_step_to_report = int(num_queries/100)
+    if num_queries < 100:
+        num_query_step_to_report = int(num_queries/10)
+    if num_queries < 10:
+        num_query_step_to_report = 1
+    query_counter = 0
+    for query in lines_by_query:
+        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length,keep_small_uniques=keep_small_uniques)
+        query_counter += 1
+    before = time.time()
+    fout = gzip.open(os.path.join(output_dir, "assemblytics_unique_length_filtered_l%d.delta.gz" % (unique_length)),'wt')
+    try:
+        f = gzip.open(filename, 'rt')
+        header1 = f.readline()
+        # Detected gzipped delta file.
+    except:
+        f = open(filename, 'r')
+        header1 = f.readline()
+        # Detected uncompressed delta file.
+    fout.write(header1)
+    fout.write(f.readline())
+    linecounter = 0
+    # For filtered delta file:
+    list_of_alignments_to_keep = []
+    alignment_counter = {}
+    keep_printing = False
+    # For coords:
+    current_query_name = ""
+    current_query_position = 0
+    fcoords_out_tab = open(os.path.join(output_dir, "assemblytics_coords.tab"),'w')
+    fcoords_out_csv = open(os.path.join(output_dir, "assemblytics_coords.csv"),'w')
+    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")
+    # For basic assembly stats:
+    ref_sequences = set()
+    query_sequences = set()
+    ref_lengths = []
+    query_lengths = []
+    # For genome length files (only sequences with at least one unique alignment,
+    # matching what ends up in coords.tab)
+    unique_ref_entries = {}
+    unique_query_entries = {}
+    f_stats_out = open(os.path.join(output_dir, "assemblytics_assembly_stats.txt"),"w")
+    for line in f:
+        linecounter += 1
+        if line[0]==">":
+            fields = line.strip().split()
+            # For delta file output:
+            query = fields[1]
+            list_of_alignments_to_keep = alignments_to_keep[query]
+            header_needed = False
+            for index in list_of_alignments_to_keep:
+                if line.strip() == header_lines_by_query[query][index]:
+                    header_needed = True
+            if header_needed == True:
+                fout.write(line) # if we have any alignments under this header, print the header
+            alignment_counter[query] = alignment_counter.get(query,0)
+            # For coords:
+            current_reference_name = fields[0][1:]
+            current_query_name = fields[1]
+            current_reference_size = int(fields[2])
+            current_query_size = int(fields[3])
+            # For basic assembly stats:
+            if not current_reference_name in ref_sequences:
+                ref_lengths.append(current_reference_size)
+                ref_sequences.add(current_reference_name)
+            if not current_query_name in query_sequences:
+                query_lengths.append(current_query_size)
+                query_sequences.add(current_query_name)
+        else:
+            fields = line.strip().split()
+            if len(fields) > 4:
+                # For coords:
+                ref_start = int(fields[0])
+                ref_end = int(fields[1])
+                query_start = int(fields[2])
+                query_end = int(fields[3])
+                csv_tag = "repetitive"
+                if alignment_counter[query] in list_of_alignments_to_keep:
+                    fout.write(line)
+                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
+                    unique_ref_entries[current_reference_name] = current_reference_size
+                    unique_query_entries[current_query_name] = current_query_size
+                    csv_tag = "unique"
+                    keep_printing = True
+                else:
+                    keep_printing = False
+                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
+                alignment_counter[query] = alignment_counter[query] + 1
+            elif keep_printing == True:
+                fout.write(line)
+    fcoords_out_tab.close()
+    fcoords_out_csv.close()
+    with open(os.path.join(output_dir, "assemblytics_ref.genome"), "w") as ref_genome_out:
+        for name, length in sorted(unique_ref_entries.items(), key=lambda item: item[1], reverse=True):
+            ref_genome_out.write("%s\t%d\n" % (name, length))
+    with open(os.path.join(output_dir, "assemblytics_query.genome"), "w") as query_genome_out:
+        for name, length in sorted(unique_query_entries.items(), key=lambda item: item[1], reverse=True):
+            query_genome_out.write("%s\t%d\n" % (name, length))
+    print("Reading file and recording all the entries we decided to keep: %d seconds for %d total lines in file" % (time.time()-before,linecounter))
+    ref_lengths.sort()
+    query_lengths.sort()
+    # Assembly statistics
+    ref_lengths = np.array(ref_lengths)
+    query_lengths = np.array(query_lengths)
+    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
+    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
+    f_stats_out.write( "Total sequence length: %s\n" %  gig_meg(sum(ref_lengths)))
+    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
+    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(ref_lengths)))
+    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths)))
+    f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths)))
+    f_stats_out.write( "\n\n")
+    f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1])
+    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
+    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
+    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths)))
+    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths)))
+    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths)))
+    f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths)))
+    f.close()
+    fout.close()
+    f_stats_out.close()
+def N50(sorted_list):
+    # List should be sorted as increasing
+    # We flip the list around here so we start with the largest element
+    cumsum = 0
+    for length in sorted_list[::-1]:
+        cumsum += length
+        if cumsum >= sum(sorted_list)/2:
+            return length
+def gig_meg(number,digits = 2):
+    gig = 1000000000.
+    meg = 1000000.
+    kil = 1000.
+    if number > gig:
+        return str(round(number/gig,digits)) + " Gbp"
+    elif number > meg:
+        return str(round(number/meg,digits)) + " Mbp"
+    elif number > kil:
+        return str(round(number/kil,digits)) + " Kbp"
+    else:
+        return str(number) + " bp"
+def intWithCommas(x):
+    if type(x) != int:
+        raise TypeError("Parameter must be an integer.")
+    if x < 0:
+        return '-' + intWithCommas(-x)
+    result = ''
+    while x >= 1000:
+        x, r = divmod(x, 1000)
+        result = ",%03d%s" % (r, result)
+    return "%d%s" % (x, result)
+def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
+    alignments_to_keep = []
+    # If no alignments:
+    if len(lines)==0:
+        return []
+    # If only one alignment:
+    if len(lines) == 1:
+        if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
+            return [0]
+        else:
+            return []
+    starts_and_stops = []
+    for query_min,query_max in lines:
+        starts_and_stops.append((query_min,"start"))
+        starts_and_stops.append((query_max,"stop"))
+    sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0))
+    current_coverage = 0
+    last_position = -1
+    sorted_unique_intervals_left = []
+    sorted_unique_intervals_right = []
+    for pos,change in sorted_starts_and_stops:
+        if current_coverage == 1:
+            sorted_unique_intervals_left.append(last_position)
+            sorted_unique_intervals_right.append(pos)
+        if change == "start":
+            current_coverage += 1
+        else:
+            current_coverage -= 1
+        last_position = pos
+    linecounter = 0
+    for query_min,query_max in lines:
+        i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left))
+        exact_match = False
+        if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
+            exact_match = True
+        sum_uniq = 0
+        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
+            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
+            i += 1
+        if sum_uniq >= unique_length_required:
+            alignments_to_keep.append(linecounter)
+        elif keep_small_uniques == True and exact_match == True:
+            alignments_to_keep.append(linecounter)
+        linecounter += 1
+    return alignments_to_keep
+def binary_search(query, numbers, left, right):
+    #  Returns index of the matching element or the first element to the right
+    if left >= right:
+        return right
+    mid = int((right+left)/2)
+    if query == numbers[mid]:
+        return mid
+    elif query < numbers[mid]:
+        return binary_search(query,numbers,left,mid)
+    else: # if query > numbers[mid]:
+        return binary_search(query,numbers,mid+1,right)
+def main():
+    parser=argparse.ArgumentParser(description="Filters alignments in delta file based on whether each alignment has a unique sequence anchoring it")
+    parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
+    parser.add_argument("--out",help="output directory for assemblytics_* files (default: current directory)" ,dest="out", type=str, default=".")
+    parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000)
+    parser.add_argument("--keep-small-uniques",help="Keep small aligments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places" ,dest="keep_small_uniques",action="store_true")
+    parser.set_defaults(func=run)
+    args=parser.parse_args()
+    args.func(args)
+if __name__=="__main__":
+    main()

assemblytics/variant_charts.py ADDED Viewed

@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+import sys
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
+def comma_format(num):
+    return "{:,}".format(int(abs(num)))
+def run(output_dir, abs_min_var, abs_max_var):
+    filename = os.path.join(output_dir, "assemblytics_structural_variants.bed")
+    if not os.path.exists(filename):
+        print(f"File {filename} not found.")
+        return
+    try:
+        bed = pd.read_csv(filename, sep="\t")
+    except Exception as e:
+        print(f"Error reading {filename}: {e}")
+        return
+    if bed.empty:
+        print("No variants found in BED file.")
+        return
+    # Rename columns to match R script expectations
+    expected_cols = ["chrom", "start", "stop", "name", "size", "strand", "type", "ref_dist", "query_dist", "contig_position", "method_found"]
+    bed.columns = expected_cols[:len(bed.columns)]
+    # Revalue types
+    type_map = {
+        "Repeat_expansion": "Repeat expansion",
+        "Repeat_contraction": "Repeat contraction",
+        "Tandem_expansion": "Tandem expansion",
+        "Tandem_contraction": "Tandem contraction"
+    }
+    bed['type'] = bed['type'].replace(type_map)
+    types_allowed = ["Insertion", "Deletion", "Repeat expansion", "Repeat contraction", "Tandem expansion", "Tandem contraction"]
+    # Filter for allowed types and set as categorical for consistent ordering
+    bed = bed[bed['type'].isin(types_allowed)]
+    bed['type'] = pd.Categorical(bed['type'], categories=types_allowed, ordered=True)
+    # Color palette (Set1 from RColorBrewer: [1,2,3,4,5,7,8])
+    # Set1 hex colors: #E41A1C, #377EB8, #4DAF4A, #984EA3, #FF7F00, #A65628
+    # R big_palette<-brewer.pal(9,"Set1")[c(1,2,3,4,5,7)] was actually using 7th which is pink.
+    # User said Set1[8] in python instead of 7 for brown.
+    # Set1 colors: 1:red, 2:blue, 3:green, 4:purple, 5:orange, 6:yellow, 7:brown, 8:pink, 9:grey
+    # Actually brewer.pal(9, "Set1") is:
+    # 1: #E41A1C (red)
+    # 2: #377EB8 (blue)
+    # 3: #4DAF4A (green)
+    # 4: #984EA3 (purple)
+    # 5: #FF7F00 (orange)
+    # 6: #FFFF33 (yellow)
+    # 7: #A65628 (brown)
+    # 8: #F781BF (pink)
+    # 9: #999999 (grey)
+    # The user says Set1[8] for brown. In R indexing starts at 1.
+    # Wait, R brewer.pal(9, "Set1")[7] is brown (#A65628).
+    # If the user says R is 1-indexed and they want Set1[8] in python... maybe they meant the 8th color in Set1 is brown?
+    # Actually in Set1, 7 is brown and 8 is pink.
+    # If the previous code used pink (#F781BF) and the user wants brown, brown is #A65628.
+    big_palette = ["#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#A65628"]
+    # Prep data for log-scaled plot
+    alt = bed.copy()
+    contraction_types = ["Deletion", "Repeat contraction", "Tandem contraction"]
+    alt.loc[alt['type'].isin(contraction_types), 'size'] = -1 * alt.loc[alt['type'].isin(contraction_types), 'size']
+    alt['Type'] = "None"
+    alt.loc[alt['type'].isin(["Insertion", "Deletion"]), 'Type'] = "Indel"
+    alt.loc[alt['type'].isin(["Tandem expansion", "Tandem contraction"]), 'Type'] = "Tandem"
+    alt.loc[alt['type'].isin(["Repeat expansion", "Repeat contraction"]), 'Type'] = "Repeat"
+    # User requested order: Indel, Repeat, Tandem
+    alt['Type'] = pd.Categorical(alt['Type'], categories=["Indel", "Repeat", "Tandem"], ordered=True)
+    # Size cutoffs
+    var_size_cutoffs = sorted(list(set([abs_min_var, 10, 50, 500, abs_max_var])))
+    var_size_cutoffs = [x for x in var_size_cutoffs if x >= abs_min_var and x <= abs_max_var]
+    for i in range(len(var_size_cutoffs) - 1):
+        min_var = var_size_cutoffs[i]
+        max_var = var_size_cutoffs[i+1]
+        if min_var < abs_max_var and max_var > abs_min_var:
+            filtered_bed = bed[(bed['size'] >= min_var) & (bed['size'] <= max_var)]
+            if not filtered_bed.empty:
+                binwidth = max(1, (max_var - min_var) / 100)
+                bins = np.arange(min_var, max_var + binwidth, binwidth)
+                # Calculate global max for y-axis synchronization
+                max_counts = []
+                for t in types_allowed:
+                    data = filtered_bed[filtered_bed['type'] == t]['size']
+                    if not data.empty:
+                        counts, _ = np.histogram(data, bins=bins)
+                        max_counts.append(max(counts))
+                global_max = max(max_counts) if max_counts else 10
+                fig, axes = plt.subplots(nrows=len(types_allowed), ncols=1, figsize=(8, 10), sharex=True)
+                fig.suptitle(f"Variants {comma_format(min_var)} to {comma_format(max_var)} bp", fontsize=16)
+                for j, t in enumerate(types_allowed):
+                    ax = axes[j]
+                    data = filtered_bed[filtered_bed['type'] == t]['size']
+                    ax.hist(data, bins=bins, color=big_palette[j], label=t)
+                    ax.set_ylabel("Count", fontsize=8)
+                    ax.tick_params(axis='both', which='major', labelsize=8)
+                    ax.set_ylim(0, global_max * 1.1) # Add 10% padding
+                    # Remove right and top spines
+                    ax.spines['right'].set_visible(False)
+                    ax.spines['top'].set_visible(False)
+                    # Add type label inside the plot, moved up to avoid data
+                    ax.text(0.98, 0.85, t, transform=ax.transAxes, horizontalalignment='right', verticalalignment='top', fontsize=10, fontweight='bold')
+                plt.xlabel("Variant size", fontsize=12)
+                plt.tight_layout(rect=[0, 0.03, 1, 0.95])
+                for fmt in ['png', 'pdf']:
+                    plt.savefig(os.path.join(output_dir, f"assemblytics_size_distributions_{min_var}-{max_var}.{fmt}"), dpi=200)
+                plt.close()
+            else:
+                print(f"No variants in plot: min_var={min_var}, max_var={max_var}")
+    # Log-scaled plot
+    fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 8), sharex=True)
+    fig.suptitle(f"Variants {comma_format(abs_min_var)} to {comma_format(abs_max_var)} bp", fontsize=16)
+    # User requested order: Indel, Repeat, Tandem
+    categories_ordered = ["Indel", "Repeat", "Tandem"]
+    types_by_category = {
+        "Indel": ["Insertion", "Deletion"],
+        "Repeat": ["Repeat expansion", "Repeat contraction"],
+        "Tandem": ["Tandem expansion", "Tandem contraction"]
+    }
+    binwidth = (2 * abs_max_var) / 100
+    bins = np.arange(-abs_max_var, abs_max_var + binwidth, binwidth)
+    # Calculate global max for y-axis synchronization in log scale
+    max_counts_log = []
+    for category in categories_ordered:
+        cat_data = alt[alt['Type'] == category]
+        if not cat_data.empty:
+            # We want to show counts + 1 to make small counts visible on log scale
+            counts, _ = np.histogram(cat_data['size'], bins=bins)
+            max_counts_log.append(max(counts) + 1)
+    global_max_log = max(max_counts_log) if max_counts_log else 100
+    for i, category in enumerate(categories_ordered):
+        ax = axes[i]
+        for t in types_by_category[category]:
+            color_idx = types_allowed.index(t)
+            data = alt[alt['type'] == t]['size']
+            if not data.empty:
+                # Use np.histogram and plt.bar to manually implement count + 1 for log scale
+                counts, bin_edges = np.histogram(data, bins=bins)
+                # To match R's log(count + 1), we plot bars of height counts + 1
+                # But we need to handle the bottom of the log scale.
+                # Actually, a better way to match R exactly is to plot counts + 1 and set ylim bottom to 1.
+                bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
+                ax.bar(bin_centers, counts + 1, width=binwidth, color=big_palette[color_idx], label=t, alpha=0.7)
+        ax.set_yscale('log')
+        ax.set_ylabel("Log(count + 1)", fontsize=10)
+        ax.tick_params(axis='both', which='major', labelsize=8)
+        ax.spines['right'].set_visible(False)
+        ax.spines['top'].set_visible(False)
+        ax.set_ylim(1, global_max_log * 1.5)
+        # Add category label
+        ax.text(0.02, 0.85, category, transform=ax.transAxes, horizontalalignment='left', fontsize=12, fontweight='bold')
+        handles, labels = ax.get_legend_handles_labels()
+        if handles:
+            ax.legend(handles, labels, loc='upper right', fontsize=8)
+    plt.xlabel("Variant size", fontsize=12)
+    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
+    for fmt in ['png', 'pdf']:
+        plt.savefig(os.path.join(output_dir, f"assemblytics_size_distributions_log.{fmt}"), dpi=200)
+    plt.close()
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print("Usage: variant_charts.py output_dir abs_min_var abs_max_var")
+        sys.exit(1)
+    output_dir = sys.argv[1]
+    abs_min_var = int(sys.argv[2])
+    abs_max_var = int(sys.argv[3])
+    run(output_dir, abs_min_var, abs_max_var)