assemblytics 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import pandas as pd
5
+ import matplotlib
6
+ matplotlib.use('Agg')
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import os
10
+
11
def run(output_dir):
    """Draw the dot plot of filtered alignments for an Assemblytics run.

    Reads ``assemblytics_oriented_coords.csv`` from *output_dir* and writes
    ``assemblytics_dotplot_filtered.png`` into the same directory.  References
    are laid out along the x axis and queries along the y axis; each sequence
    is shifted by the cumulative length of the sequences placed before it.

    Args:
        output_dir: Directory holding the oriented coordinates CSV; the PNG is
            written here as well.

    Returns:
        None.  A message is printed and nothing is drawn when the CSV is
        missing or holds no alignment rows.
    """
    max_alignments = 100000  # only the first rows of the CSV are plotted

    filename = os.path.join(output_dir, "assemblytics_oriented_coords.csv")
    plot_output_filename = os.path.join(output_dir, "assemblytics_dotplot_filtered")
    plot_title = "Dot plot of Assemblytics filtered alignments"

    if not os.path.exists(filename):
        print(f"File {filename} not found.")
        return

    try:
        coords = pd.read_csv(filename)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header at all; treat it like "no rows".
        coords = pd.DataFrame()

    if coords.empty:
        # Row-wise processing of an empty frame cannot produce the location
        # columns, so stop here instead of failing further down.
        print(f"No alignments found in {filename}.")
        return

    # Work on a private copy so the column assignments below never touch a
    # slice of the frame returned by read_csv.
    coords = coords.head(max_alignments).copy()
    coords['ref'] = coords['ref'].astype(str)
    coords['query'] = coords['query'].astype(str)

    # Ordering chromosomes: well-known names first, in a fixed order, then
    # every other reference in order of first appearance.
    ordered_common_names = (
        [str(i) for i in range(1, 101)]
        + [f"chr{i}" for i in range(1, 101)]
        + [f"Chr{i}" for i in range(1, 101)]
        + ["X", "Y", "M", "MT", "Chr0", "chr0", "0"]
    )
    common_names = set(ordered_common_names)
    unique_refs = coords['ref'].unique()
    present_refs = set(unique_refs)
    all_refs_ordered = (
        [r for r in ordered_common_names if r in present_refs]
        + [r for r in unique_refs if r not in common_names]
    )

    coords['ref'] = pd.Categorical(coords['ref'], categories=all_refs_ordered, ordered=True)
    # Stable sort keeps the file order inside each reference, which makes
    # tie-breaking below reproducible.
    coords = coords.sort_values('ref', kind='stable')

    # Get chromosome lengths and calculate offsets
    chr_lengths = (
        coords.groupby('ref', observed=False)['ref_length']
        .max()
        .reindex(all_refs_ordered)
        .fillna(0)
    )
    chr_offsets = chr_lengths.cumsum().shift(1).fillna(0)

    # Look the offset up once per row with a vectorised map instead of a
    # Python-level apply over every row.
    ref_offset_by_name = dict(zip(all_refs_ordered, chr_offsets.to_numpy()))
    ref_offsets = coords['ref'].astype(str).map(ref_offset_by_name)
    coords['ref_loc_start'] = ref_offsets + coords['ref_start']
    coords['ref_loc_stop'] = ref_offsets + coords['ref_end']

    # Calculate alignment length for query ordering
    coords['alignment_length'] = (coords['query_start'] - coords['query_end']).abs()

    # Pick longest alignment for each query to decide query ordering
    longest_alignments = coords.loc[coords.groupby('query')['alignment_length'].idxmax()]
    ordered_queries = (
        longest_alignments.sort_values('ref_loc_start', kind='stable')['query'].tolist()
    )

    # Get query lengths and calculate offsets
    query_lengths = (
        coords.groupby('query')['query_length']
        .max()
        .reindex(ordered_queries)
        .fillna(0)
    )
    query_offsets = query_lengths.cumsum().shift(1).fillna(0)

    query_offset_by_name = dict(zip(ordered_queries, query_offsets.to_numpy()))
    row_query_offsets = coords['query'].map(query_offset_by_name)
    coords['query_loc_start'] = row_query_offsets + coords['query_start']
    coords['query_loc_stop'] = row_query_offsets + coords['query_end']

    # Labels (hide for small chromosomes/queries)
    total_ref_length = chr_lengths.sum()
    chr_labels = [
        name if length >= 0.02 * total_ref_length else ""
        for name, length in chr_lengths.items()
    ]
    chr_breaks = chr_lengths.cumsum().tolist()

    total_query_length = query_lengths.sum()
    query_labels = [
        name if length >= 0.02 * total_query_length else ""
        for name, length in query_lengths.items()
    ]
    query_breaks = query_lengths.cumsum().tolist()

    # Plotting.  matplotlib helpers are imported here so the early-return
    # paths above do not need them.
    from matplotlib.collections import LineCollection
    from matplotlib.lines import Line2D

    colors = {"unique": "black", "repetitive": "red"}

    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        # "unique" is added first so "repetitive" segments are drawn on top.
        for tag in ["unique", "repetitive"]:
            df = coords[coords['tag'] == tag]
            if df.empty:
                continue
            starts = df[['ref_loc_start', 'query_loc_start']].to_numpy(dtype=float)
            stops = df[['ref_loc_stop', 'query_loc_stop']].to_numpy(dtype=float)
            # One collection per tag instead of one Line2D artist per row.
            # zorder=2 matches the default of plt.plot lines so the grid
            # stays underneath the alignments.
            segments = np.stack([starts, stops], axis=1)
            ax.add_collection(
                LineCollection(
                    segments,
                    colors=colors[tag],
                    linewidths=1.5,
                    capstyle='butt',
                    zorder=2,
                )
            )

        plt.title(plot_title, fontsize=16)
        plt.xlabel("Reference", fontsize=14)
        plt.ylabel("Query", fontsize=14)

        plt.xticks(chr_breaks, chr_labels, rotation=90, fontsize=8)
        plt.yticks(query_breaks, query_labels, fontsize=8)

        # Collections do not autoscale the axes; the limits are set here.
        plt.xlim(0, total_ref_length)
        plt.ylim(0, total_query_length)

        plt.grid(True, linestyle='-', linewidth=0.1, color='black')

        # Custom legend
        legend_elements = [
            Line2D([0], [0], color='black', lw=2, label='unique'),
            Line2D([0], [0], color='red', lw=2, label='repetitive'),
        ]
        plt.legend(handles=legend_elements, title="Filter", loc='upper right')

        plt.tight_layout()
        plt.savefig(plot_output_filename + ".png", dpi=200)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)
111
+
112
if __name__ == "__main__":
    # The output directory is the only (mandatory) positional argument.
    if len(sys.argv) >= 2:
        run(sys.argv[1])
    else:
        print("Usage: dotplot.py output_dir")
        sys.exit(1)
assemblytics/index.py ADDED
@@ -0,0 +1,188 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import os
5
+ import numpy as np
6
+ import re
7
+
8
+
9
def run(args):
    """Index and orient an alignment coordinates CSV for dot plotting.

    The input CSV has one header line followed by comma-separated rows whose
    columns are used positionally: 0 ref start, 1 ref stop, 2 query start,
    3 query stop, 4 reference length, 5 query length, 6 reference name,
    7 query name, 8 tag (``"unique"`` or anything else for repetitive).

    Files written into the output directory:

    * ``assemblytics_ref_index.csv`` - references (natural name order) with
      the queries that have unique alignments to them.
    * ``assemblytics_query_index.csv`` - queries ordered by the median
      reference position of their unique alignments.
    * ``assemblytics_oriented_coords.csv`` - all rows with an extra
      ``alignment_length`` column; queries whose unique alignments are mostly
      reverse-strand have their query coordinates flipped.
    * ``assemblytics_info.csv`` - alignment counts.
    * ``assemblytics_oriented_coords_repetitive.csv`` - only when there are
      100000 or more repetitive alignments, which are then kept out of the
      main oriented file.

    Args:
        args: Namespace with ``coords`` (input CSV path) and ``out`` (output
            directory; an empty value means the current directory).
    """
    max_inline_repetitives = 100000

    coords = args.coords
    output_dir = args.out or "."
    os.makedirs(output_dir, exist_ok=True)

    # Read the file once; blank lines (for example a trailing newline) are
    # skipped instead of failing on a missing column.
    with open(coords) as f:
        header = f.readline().strip()
        rows = [line.strip().split(",") for line in f if line.strip()]

    fields_by_query = {}
    query_lengths = {}
    reference_lengths = []
    seen_references = set()
    for fields in rows:
        query_name = fields[7]
        query_lengths[query_name] = int(fields[5])
        fields_by_query.setdefault(query_name, []).append(fields)

        ref_name = fields[6]
        ref_length = int(fields[4])
        if ref_name not in seen_references:
            seen_references.add(ref_name)
            reference_lengths.append((ref_name, ref_length))

    # Find the order of the reference chromosomes
    reference_lengths.sort(key=lambda x: natural_key(x[0]))

    # Find the cumulative sums
    cumulative_sum = 0
    ref_chrom_offsets = {}
    queries_by_reference = {}
    for ref, ref_length in reference_lengths:
        ref_chrom_offsets[ref] = cumulative_sum
        cumulative_sum += ref_length
        queries_by_reference[ref] = set()

    # Calculate relative positions of each alignment in this cumulative
    # length, take the median of these for each query, then sort the queries
    # by those scores.
    flip_by_query = {}
    references_by_query = {}  # for index
    relative_ref_position_by_query = []  # for ordering

    for query_name, lines in fields_by_query.items():
        sum_forward = 0
        sum_reverse = 0
        ref_position_scores = []
        references_by_query[query_name] = set()
        for fields in lines:
            if fields[8] != "unique":
                continue
            ref_start = int(fields[0])
            ref_stop = int(fields[1])
            query_start = int(fields[2])
            query_stop = int(fields[3])
            alignment_length = abs(query_stop - query_start)
            ref = fields[6]

            # for index:
            references_by_query[query_name].add(ref)
            queries_by_reference[ref].add(query_name)

            # for ordering:
            ref_position_scores.append(ref_chrom_offsets[ref] + (ref_start + ref_stop) / 2)

            # for orientation:
            if query_stop < query_start:
                sum_reverse += alignment_length
            else:
                sum_forward += alignment_length

        # orientation: flip when most uniquely aligned bases are reversed
        flip_by_query[query_name] = sum_reverse > sum_forward

        # ordering: queries without unique alignments sort to the front
        if ref_position_scores:
            relative_ref_position_by_query.append((query_name, np.median(ref_position_scores)))
        else:
            relative_ref_position_by_query.append((query_name, 0))

    relative_ref_position_by_query.sort(key=lambda x: x[1])

    # Set members are sorted before joining so the index files do not depend
    # on set iteration order.
    with open(os.path.join(output_dir, "assemblytics_ref_index.csv"), 'w') as fout_ref_index:
        fout_ref_index.write("ref,ref_length,matching_queries\n")
        # reference_lengths is sorted by the reference chromosome name
        for ref, ref_length in reference_lengths:
            fout_ref_index.write(
                "%s,%d,%s\n" % (ref, ref_length, "~".join(sorted(queries_by_reference[ref])))
            )

    with open(os.path.join(output_dir, "assemblytics_query_index.csv"), 'w') as fout_query_index:
        fout_query_index.write("query,query_length,matching_refs\n")
        # relative_ref_position_by_query is sorted by rel_pos
        for query, _rel_pos in relative_ref_position_by_query:
            fout_query_index.write(
                "%s,%d,%s\n" % (query, query_lengths[query], "~".join(sorted(references_by_query[query])))
            )

    uniques = []
    repetitives = []
    for fields in rows:
        if flip_by_query[fields[7]]:
            # Mirror the query coordinates around the query length.
            fields[2] = int(fields[5]) - int(fields[2])
            fields[3] = int(fields[5]) - int(fields[3])
        # Length of the query span; mirroring does not change it.
        fields.append(abs(int(fields[3]) - int(fields[2])))
        if fields[8] == "unique":
            uniques.append(fields)
        else:
            repetitives.append(fields)

    # The appended alignment length is always the last element, whatever the
    # number of columns in the row.
    uniques.sort(key=lambda x: x[-1], reverse=True)
    repetitives.sort(key=lambda x: x[-1], reverse=True)

    show_repetitives = len(repetitives) < max_inline_repetitives

    with open(os.path.join(output_dir, "assemblytics_oriented_coords.csv"), 'w') as fout:
        fout.write(header + ",alignment_length\n")  # copy the header
        for fields in uniques:
            fout.write(",".join(map(str, fields)) + "\n")
        if show_repetitives:
            for fields in repetitives:
                fout.write(",".join(map(str, fields)) + "\n")

    if not show_repetitives:
        repeats_path = os.path.join(output_dir, "assemblytics_oriented_coords_repetitive.csv")
        with open(repeats_path, 'w') as fout_repeats:
            fout_repeats.write(header + ",alignment_length\n")  # copy the header
            for fields in repetitives:
                fout_repeats.write(",".join(map(str, fields)) + "\n")

    with open(os.path.join(output_dir, "assemblytics_info.csv"), 'w') as fout_info:
        fout_info.write("key,value\n")
        fout_info.write("unique alignments,%d\n" % len(uniques))
        fout_info.write("repetitive alignments,%d\n" % len(repetitives))
        if show_repetitives:
            fout_info.write("showing repetitive alignments,True\n")
        else:
            fout_info.write("showing repetitive alignments,False: Too many\n")
173
+
174
def natural_key(string_):
    """Return a sort key that orders embedded numbers numerically.

    The string is split into alternating text and digit runs; digit runs are
    converted to ``int`` so that, for example, "chr2" sorts before "chr10".
    See http://www.codinghorror.com/blog/archives/001018.html
    """
    key = []
    for chunk in re.split(r'(\d+)', string_):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk)
    return key
177
+
178
+
179
def main():
    """Parse the command line and run the index/orient step."""
    parser = argparse.ArgumentParser(
        description="Index and orient a coordinate file for dotplots."
    )
    parser.add_argument(
        "-coords",
        dest="coords",
        type=str,
        required=True,
        help="coords.csv file from uniq_anchor.py",
    )
    parser.add_argument(
        "-out",
        dest="out",
        type=str,
        default=".",
        help="output directory for assemblytics_* index and oriented coordinates files (default: current directory)",
    )
    parser.set_defaults(func=run)
    parsed = parser.parse_args()
    parsed.func(parsed)


if __name__ == "__main__":
    main()
assemblytics/nchart.py ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import pandas as pd
5
+ import matplotlib
6
+ matplotlib.use('Agg')
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import os
10
+
11
def bp_format(num):
    """Format a base-pair count with a Gbp/Mbp/Kbp/bp unit.

    Values above a unit's size are scaled to that unit and shown with up to
    three decimals; trailing zeros (and a dangling decimal point) are removed
    from the number *before* the unit is appended, so 1500000 becomes
    "1.5 Mbp" rather than "1.500 Mbp".  Values of 1000 or less are shown as a
    whole number of bp with thousands separators.

    Args:
        num: Number of base pairs.

    Returns:
        The formatted string, e.g. "2 Gbp", "1.234 Kbp" or "999 bp".
    """
    units = (("Gbp", 1000000000), ("Mbp", 1000000), ("Kbp", 1000))
    for unit, scale in units:
        if num > scale:
            # Strip on the bare number: once the unit suffix is attached the
            # string no longer ends in a digit and nothing would be removed.
            value = "{:,.3f}".format(num / scale).rstrip('0').rstrip('.')
            return "{} {}".format(value, unit)
    return "{:,} bp".format(int(num))
20
+
21
def run(output_dir):
    """Plot cumulative sequence length (NG(x)) for reference and query.

    Reads the tab-separated ``assemblytics_ref.genome`` and
    ``assemblytics_query.genome`` files (name, length) from *output_dir* and
    saves ``assemblytics_nchart.png`` and ``assemblytics_nchart.pdf`` there.
    Prints a message and returns without plotting when either file is missing
    or cannot be read.
    """
    ref_genome = os.path.join(output_dir, "assemblytics_ref.genome")
    query_genome = os.path.join(output_dir, "assemblytics_query.genome")
    if not (os.path.exists(ref_genome) and os.path.exists(query_genome)):
        print(f"File {ref_genome} or {query_genome} not found.")
        return

    def load_genome(path):
        # Two unnamed tab-separated columns: sequence name and length.
        return pd.read_csv(path, sep="\t", header=None, names=["name", "length"])

    try:
        ref_data = load_genome(ref_genome)
        query_data = load_genome(query_genome)
    except Exception as e:
        print(f"Error reading {ref_genome} or {query_genome}: {e}")
        return

    # Longest sequences first.
    ref_data = ref_data.sort_values("length", ascending=False)
    query_data = query_data.sort_values("length", ascending=False)

    # 100% on the x axis is the larger of the two assembly sizes.
    genome_length = max(ref_data["length"].sum(), query_data["length"].sum())

    def cumulative(table, source):
        # Calculate cumulative distributions
        return pd.DataFrame({
            "NG": (table["length"].cumsum() / genome_length * 100),
            "contig_length": table["length"],
            "contig_source": source
        })

    def with_origin(frame, source):
        # Add zeros for the start of the plot
        origin = pd.DataFrame({
            "NG": [0],
            "contig_length": [frame["contig_length"].max()],
            "contig_source": [source]
        })
        return pd.concat([origin, frame])

    ref_cumsum = cumulative(ref_data, "Reference")
    query_cumsum = cumulative(query_data, "Query")
    both_plot = pd.concat([ref_cumsum, query_cumsum])

    with_zeros = pd.concat([
        with_origin(ref_cumsum, "Reference"),
        with_origin(query_cumsum, "Query"),
    ])

    plt.figure(figsize=(8, 8))
    colors = {"Reference": "limegreen", "Query": "blue"}
    draw_steps = len(with_zeros) > 2

    for source in ["Reference", "Query"]:
        points = both_plot[both_plot["contig_source"] == source]
        if draw_steps:
            data = with_zeros[with_zeros["contig_source"] == source]
            plt.step(data["NG"], data["contig_length"], where='post', color=colors[source], label=source, linewidth=1.5, alpha=0.5)
            plt.scatter(points["NG"], points["contig_length"], color=colors[source], s=20, alpha=0.5)
        else:
            plt.scatter(points["NG"], points["contig_length"], color=colors[source], s=40, alpha=0.5, label=source)

    plt.yscale('log')
    plt.xlim(0, 100)
    plt.ylim(1, genome_length * 1.1)

    plt.xlabel(f"NG(x)% where 100% = {bp_format(genome_length)}")
    plt.ylabel("Sequence length")
    plt.title("Cumulative sequence length")
    plt.legend(title="Assembly")
    plt.grid(True, which="both", ls="-", alpha=0.2)

    plt.tight_layout()
    for fmt in ('png', 'pdf'):
        plt.savefig(os.path.join(output_dir, f"assemblytics_nchart.{fmt}"), dpi=200)
    plt.close()
90
+
91
if __name__ == "__main__":
    # The output directory is the only (mandatory) positional argument.
    if len(sys.argv) >= 2:
        run(sys.argv[1])
    else:
        print("Usage: nchart.py output_dir")
        sys.exit(1)
@@ -0,0 +1,147 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import os
5
+ import numpy as np
6
+
7
def SVtable(args):
    """Print and save a per-type size summary of Assemblytics variants.

    Reads the whitespace-separated variant file ``args.file``; column 4
    (0-based) is the variant size and column 6 the variant type.  Lines whose
    size column is not a plain non-negative integer (such as a header), and
    lines with fewer than seven columns (such as blank lines), are skipped.

    For every variant type a table of counts and total bases per size range
    is printed and written to ``<input without extension>_summary.csv``,
    followed by overall totals for all variants and for structural variants
    (size >= 50).

    Args:
        args: Namespace with ``file`` (input path), ``minimum_variant_size``
            and ``maximum_variant_size`` (ints bounding the size ranges
            shown).
    """
    filename = args.file
    minimum_variant_size = args.minimum_variant_size
    maximum_variant_size = args.maximum_variant_size
    simplify_types = False

    simplified_names = {
        "Insertion": "Insertion/Expansion",
        "Expansion": "Insertion/Expansion",
        "Deletion": "Deletion/Contraction",
        "Contraction": "Deletion/Contraction",
    }

    typeList = []
    sizeList = []
    rawTypes = []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split()
            # Skip headers, blank lines and truncated lines.
            if len(fields) < 7 or not fields[4].isdigit():
                continue
            svType = fields[6]
            rawTypes.append(svType)
            if simplify_types:
                typeList.append(simplified_names.get(svType, svType))
            else:
                typeList.append(svType)
            sizeList.append(int(fields[4]))
    linecounter = len(sizeList)

    size_thresholds = [10, 50, 500, 10000, 50000, 100000, 500000, 1000000]

    sizeArray = np.array(sizeList)
    typeArray = np.array(typeList)
    svTypes = ["Insertion", "Deletion", "Repeat_expansion", "Repeat_contraction", "Tandem_expansion", "Tandem_contraction"]
    if simplify_types:
        svTypes = ["Insertion/Expansion", "Deletion/Contraction"]
    overall_total = 0
    overall_total_bases = 0
    overall_total_SVs = 0
    overall_total_SV_bases = 0

    SV_size = 50

    # Known types first; any other type found in the file follows in sorted
    # order so the report layout is the same on every run.
    all_SV_types = svTypes + sorted(set(rawTypes) - set(svTypes))

    row_format = "%20s%10s%15s"

    base, _ = os.path.splitext(filename)
    with open(base + "_summary.csv", 'w') as f_output_csv:
        if linecounter > 0:
            for svType in all_SV_types:
                sizes = sizeArray[typeArray == svType]
                structural = sizes[sizes >= SV_size]
                overall_total += len(sizes)
                overall_total_bases += int(sizes.sum())
                overall_total_SVs += len(structural)
                overall_total_SV_bases += int(structural.sum())
                print(svType)
                f_output_csv.write(svType + "\n")

                print(row_format % ("", "Count", "Total bp"))
                f_output_csv.write("Size range,Count,Total bp\n")

                previous_size = minimum_variant_size
                for threshold in size_thresholds:
                    # Skip bins entirely below the minimum or starting at or
                    # above the maximum requested size.
                    if threshold <= minimum_variant_size or previous_size >= maximum_variant_size:
                        continue
                    subset = sizes[(sizes >= previous_size) & (sizes < threshold)]
                    count = str(len(subset))
                    bases = str(int(subset.sum()))
                    print(row_format % ("%s-%s bp: " % (intWithCommas(previous_size), intWithCommas(threshold)), count, bases))
                    f_output_csv.write("%s,%s,%s\n" % ("%s-%s bp" % (previous_size, threshold), count, bases))
                    previous_size = threshold

                if previous_size < maximum_variant_size:
                    # Open-ended last bin.
                    subset = sizes[sizes >= previous_size]
                    count = str(len(subset))
                    bases = str(int(subset.sum()))
                    print(row_format % ("> %s bp: " % (intWithCommas(previous_size)), count, bases))
                    f_output_csv.write("%s,%s,%s\n" % ("> %s bp" % (previous_size), count, bases))

                total_count = str(len(sizes))
                total_bases = str(int(sizes.sum()))
                print(row_format % ("Total: ", total_count, total_bases) + "\n")
                f_output_csv.write("%s,%s,%s\n\n" % ("Total", total_count, total_bases))
        else:
            print("No variants found. Plots depicting variant size distributions will also be missing.\n")

        print("Total number of all variants: %s" % (intWithCommas(overall_total)))
        f_output_csv.write("Total for all variants,%s,%s bp\n" % (overall_total, int(overall_total_bases)))
        print("Total bases affected by all variants: %s" % (gig_meg(int(overall_total_bases))))

        print("Total number of structural variants: %s" % (intWithCommas(overall_total_SVs)))
        f_output_csv.write("Total for all structural variants,%s,%s bp\n" % (overall_total_SVs, int(overall_total_SV_bases)))
        print("Total bases affected by structural variants: %s" % (gig_meg(int(overall_total_SV_bases))))
99
+
100
def gig_meg(number,digits = 2):
    """Express *number* of bases in Gbp, Mbp or Kbp, rounded to *digits*.

    The largest unit that *number* strictly exceeds is used; values of 1000
    or less are returned unscaled with a " bp" suffix.
    """
    scales = (
        (1000000000., " Gbp"),
        (1000000., " Mbp"),
        (1000., " Kbp"),
    )
    for scale, suffix in scales:
        if number > scale:
            return str(round(number/scale, digits)) + suffix
    return str(number) + " bp"
113
+
114
+
115
def intWithCommas(x):
    """Format an integer with commas as thousands separators.

    Args:
        x: An integer.  Subclasses of ``int`` and other integral number
            types are accepted; booleans are not.

    Returns:
        The decimal representation of *x* with a comma between each group of
        three digits, e.g. "-1,234,567".

    Raises:
        TypeError: If *x* is not an integer.
    """
    from numbers import Integral

    # bool is a subclass of int but is not a meaningful count here.
    if isinstance(x, bool) or not isinstance(x, Integral):
        raise TypeError("Parameter must be an integer.")
    return format(int(x), ",")
125
+
126
def main():
    """Parse the command line and print the variant summary table."""
    parser = argparse.ArgumentParser(
        description='Output a summary table of variants from Assemblytics',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # (flag, destination attribute, value type, help text); all are required.
    option_specs = (
        ('-i', 'file', str, 'bed file of variants from Assemblytics'),
        ('-min', 'minimum_variant_size', int, 'minimum variant size'),
        ('-max', 'maximum_variant_size', int, 'maximum variant size'),
    )
    for flag, dest, value_type, help_text in option_specs:
        parser.add_argument(flag, help=help_text, dest=dest, type=value_type, required=True)

    SVtable(parser.parse_args())


if __name__ == "__main__":
    main()
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+