assemblytics 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
# Version string of this package release.
__version__ = "2.0.0"
assemblytics/cli.py ADDED
@@ -0,0 +1,211 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Python orchestrator for the Assemblytics pipeline."""
4
+
5
+ import argparse
6
+ import io
7
+ import os
8
+ import sys
9
+ import zipfile
10
+
11
+ from .dot_prep import run as run_dot_prep
12
+ from .dotplot import run as run_dotplot
13
+ from .index import run as run_index
14
+ from .nchart import run as run_nchart
15
+ from .summary import SVtable as run_summary
16
+ from .uniq_anchor import run as run_uniq_anchor
17
+ from .variant_charts import run as run_variant_charts
18
+ from .variants import run as run_variants
19
+
20
+
21
# Usage synopsis; main() hands it to argparse as the parser's `usage=` text.
USAGE = "assemblytics -d delta -o output_dir -l unique_length -min min_size -max max_size"
22
+
23
+
24
def log_progress(log_file, message):
    """Append ``message`` followed by a newline to the file at ``log_file``.

    The file is opened in append mode, so it is created when missing and
    earlier entries are preserved.
    """
    entry = "".join((message, "\n"))
    with open(log_file, "a") as handle:
        handle.write(entry)
27
+
28
+
29
def fail(log_file, step, message=None, exit_code=1):
    """Record a failed pipeline step in the progress log and abort the process.

    Args:
        log_file: Path of the progress log to append to.
        step: Progress-log line describing the failure; written verbatim.
        message: Optional human-readable detail, echoed to stderr when given.
            It is optional because the call sites in this module pass only
            ``log_file`` and ``step``; as a required parameter that made every
            failure path raise ``TypeError`` before anything was logged.
        exit_code: Exit status handed to ``sys.exit``.

    Raises:
        SystemExit: Always, with ``exit_code``.
    """
    log_progress(log_file, step)
    if message:
        print(message, file=sys.stderr)
    sys.exit(exit_code)
32
+
33
+
34
def zip_results(output_dir):
    """Bundle the ``assemblytics_*`` entries of ``output_dir`` into one archive.

    The archive is written to ``<output_dir>/assemblytics_results.zip`` using
    DEFLATE compression. Every directory entry whose name starts with
    ``assemblytics_`` is added under its bare name, except the archive itself.

    Args:
        output_dir: Directory holding the result files; also receives the zip.
    """
    zip_filename = "assemblytics_results.zip"
    zip_path = os.path.join(output_dir, zip_filename)
    # os.listdir() yields entries in arbitrary order. Snapshot and sort them
    # before writing so the archive layout is reproducible between runs.
    members = sorted(
        name
        for name in os.listdir(output_dir)
        if name.startswith("assemblytics_") and name != zip_filename
    )
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for name in members:
            archive.write(os.path.join(output_dir, name), name)
41
+
42
+
43
def run_summary_to_file(output_dir, minimum_size, maximum_size):
    """Run the summary step and save everything it prints to a text file.

    ``run_summary`` is called with a namespace whose ``file`` points at
    ``assemblytics_structural_variants.bed`` inside ``output_dir`` and whose
    size bounds are the given values. While it runs, ``sys.stdout`` is swapped
    for an in-memory buffer; the captured text is then written to
    ``assemblytics_structural_variants_summary.txt`` in ``output_dir``.
    """
    def in_output_dir(name):
        return os.path.join(output_dir, name)

    destination = in_output_dir("assemblytics_structural_variants_summary.txt")
    options = argparse.Namespace(
        file=in_output_dir("assemblytics_structural_variants.bed"),
        minimum_variant_size=minimum_size,
        maximum_variant_size=maximum_size,
    )

    captured = io.StringIO()
    real_stdout, sys.stdout = sys.stdout, captured
    try:
        run_summary(options)
    finally:
        # Put the real stdout back even if the summary step raises.
        sys.stdout = real_stdout

    with open(destination, "w") as handle:
        handle.write(captured.getvalue())
60
+
61
+
62
def run(args):
    """Run the complete Assemblytics pipeline for one delta file.

    Steps, in order: unique-anchor filtering, variant calling, coordinate
    indexing plus summary statistics, static figures, preparation of the Dot
    input files, and zipping of the results. Progress lines are appended to
    ``assemblytics_progress.log`` inside the output directory, and a
    ``FILE_READY:<name>`` line is printed after each output is expected to
    exist.

    Args:
        args: Namespace providing ``delta``, ``output_dir``, ``unique_length``,
            ``minimum_size`` and ``maximum_size``. ``long_range`` is optional
            and treated as ``False`` when absent.

    Raises:
        SystemExit: Through ``fail`` when the filtered delta or the variants
            BED file is missing after its step, or when the summary file does
            not contain the text ``Total``.
    """
    delta = args.delta
    output_dir = args.output_dir
    unique_length = args.unique_length
    minimum_size = args.minimum_size
    maximum_size = args.maximum_size
    # Optional so that callers building their own namespace may omit it.
    long_range = getattr(args, "long_range", False)

    print("Input delta file:", delta)
    print("Output directory:", output_dir)
    print("Unique anchor length:", unique_length)
    print("Minimum variant size to call:", minimum_size)
    print("Maximum variant size to call:", maximum_size)

    os.makedirs(output_dir, exist_ok=True)

    log_file = os.path.join(output_dir, "assemblytics_progress.log")
    print("Logging progress updates in", log_file)

    log_progress(log_file, "STARTING,DONE,Starting unique anchor filtering.")

    print("1. Filter delta file")
    run_uniq_anchor(
        argparse.Namespace(
            delta=delta,
            out=output_dir,
            unique_length=unique_length,
            keep_small_uniques=True,
        )
    )
    print("FILE_READY:assemblytics_assembly_stats.txt")
    print("FILE_READY:assemblytics_coords.tab")
    print("FILE_READY:assemblytics_coords.csv")

    filtered_delta = os.path.join(output_dir, "assemblytics_unique_length_filtered_l{}.delta.gz".format(unique_length))
    if not os.path.exists(filtered_delta):
        # fail() declares a `message` parameter; pass it explicitly so the
        # failure is logged and the process exits instead of raising TypeError.
        fail(
            log_file,
            "UNIQFILTER,FAIL,Step 1: uniq_anchor.py failed: "
            "Possible problem with Python or Python packages on server.",
            "Expected output file was not created: " + filtered_delta,
        )
    print("FILE_READY:" + os.path.basename(filtered_delta))

    log_progress(
        log_file,
        "UNIQFILTER,DONE,Step 1: uniq_anchor.py completed successfully. "
        "Now finding variants between alignments.",
    )

    print("2. Finding structural variants")
    combined_path = os.path.join(output_dir, "assemblytics_structural_variants.bed")
    long_range_path = os.path.join(output_dir, "assemblytics_long_range_variants.bed") if long_range else None
    run_variants(filtered_delta, minimum_size, maximum_size, minimum_size, combined_path, long_range_path)
    if not os.path.exists(combined_path):
        fail(
            log_file,
            "VARIANTS,FAIL,Step 2: variants.py failed: "
            "Possible problem with Python on server.",
            "Expected output file was not created: " + combined_path,
        )
    print("FILE_READY:" + os.path.basename(combined_path))
    if long_range:
        print("FILE_READY:" + os.path.basename(long_range_path))

    log_progress(
        log_file,
        "VARIANTS,DONE,Step 2: variants.py completed successfully. "
        "Now generating figures and summary statistics.",
    )

    print("3. Index coordinates and generate summary statistics")
    run_index(
        argparse.Namespace(
            coords=os.path.join(output_dir, "assemblytics_coords.csv"),
            out=output_dir,
        )
    )
    run_summary_to_file(output_dir, minimum_size, maximum_size)
    print("FILE_READY:assemblytics_structural_variants_summary.txt")

    print("4. Generating figures")
    run_variant_charts(output_dir, minimum_size, maximum_size)
    # Charts are ready incrementally too
    charts = [f for f in os.listdir(output_dir) if f.startswith("assemblytics_size_distributions") and f.endswith(".png")]
    for chart in charts:
        print("FILE_READY:" + chart)

    run_dotplot(output_dir)
    print("FILE_READY:assemblytics_dotplot_filtered.png")

    run_nchart(output_dir)
    print("FILE_READY:assemblytics_nchart.png")

    print("5. Preparing interactive Dot plot")
    dot_prefix = os.path.join(output_dir, "assemblytics_dot")
    run_dot_prep(
        argparse.Namespace(
            delta=delta,
            out=dot_prefix,
            unique_length=unique_length,
            overview=1000,
        ),
        # Step 1 already produced a filtered delta; only the Dot files are needed here.
        write_delta=False,
    )
    print("FILE_READY:assemblytics_dot.coords")
    print("FILE_READY:assemblytics_dot.coords.idx")

    zip_results(output_dir)
    print("FILE_READY:assemblytics_results.zip")

    # The summary step counts as successful only if its output mentions "Total".
    summary_path = os.path.join(output_dir, "assemblytics_structural_variants_summary.txt")
    with open(summary_path) as summary:
        if "Total" not in summary.read():
            fail(
                log_file,
                "SUMMARY,FAIL,Step 3: summary.py failed",
                "No 'Total' entry found in " + summary_path,
            )

    log_progress(
        log_file,
        "SUMMARY,DONE,Step 3: summary.py completed successfully",
    )
180
+
181
+
182
def main():
    """Parse the command line and run the pipeline with the parsed options."""
    long_range_help = (
        "Also report long-range and inter-chromosomal candidate variants (events bigger "
        "than --maximum_size, or spanning two different reference chromosomes) to a "
        "separate assemblytics_long_range_variants.bed file. These are usually caused by "
        "misassemblies, but can also represent real translocations or other large-scale "
        "rearrangements, so they're kept out of the main results by default and require "
        "manual review."
    )
    # Integer-valued options: (flags, default, help text), in display order.
    integer_options = (
        (("-l", "--unique_length"), 10000, "Unique anchor length requirement (default: 10000)"),
        (("-min", "--minimum_size"), 50, "Minimum variant size to call (default: 50)"),
        (("-max", "--maximum_size"), 10000, "Maximum variant size to call (default: 10000)"),
    )

    parser = argparse.ArgumentParser(
        usage=USAGE,
        description="Assemblytics structural variant detection pipeline",
    )
    add = parser.add_argument
    add("-d", "--delta", required=True, help="MUMmer delta file (.delta or .delta.gz)")
    add("-o", "--output_dir", default=".", help="Output directory for assemblytics_* result files (default: current directory)")
    for flags, default, text in integer_options:
        add(*flags, type=int, default=default, help=text)
    add("--long-range", dest="long_range", action="store_true", help=long_range_help)

    parser.set_defaults(func=run)
    options = parser.parse_args()
    options.func(options)
208
+
209
+
210
# Run the command-line entry point when this module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,430 @@
1
+ #! /usr/bin/env python
2
+
3
+ # Author: Maria Nattestad
4
+ # Email: maria.nattestad@gmail.com
5
+
6
+ # This script prepares a nucmer output delta file for visualization in Dot
7
+ # Parts of this code are adapted from Assemblytics unique anchor filtering
8
+
9
+
10
+ import argparse
11
+ import gzip
12
+ import time
13
+ import numpy as np
14
+ import operator
15
+ import re
16
+
17
def run(args, write_delta=True):
    """Filter a delta file by unique anchors and write the Dot input files.

    Reads ``args.delta``, ``args.unique_length`` and ``args.out`` (used as the
    output prefix); ``args.overview`` is optional and defaults to 1000.
    ``keep_small_uniques`` is always passed as ``True`` to the uniqueness
    calculation. ``write_delta`` is forwarded to ``writeFilteredDeltaFile``.
    """
    delta_path, required_unique, prefix = args.delta, args.unique_length, args.out
    overview_limit = getattr(args, 'overview', 1000)
    small_uniques_kept = True

    headers, intervals = getQueryRefCombinations(delta_path)
    kept = calculateUniqueness(headers, intervals, required_unique, small_uniques_kept)
    ref_lengths, alignments = writeFilteredDeltaFile(
        delta_path, prefix, kept, required_unique, headers, write_delta=write_delta
    )
    index_for_dot(ref_lengths, alignments, prefix, overview_limit)
28
+
29
+
30
def scrub(string):
    """Return ``string`` with every ``,``, ``!``, ``~`` and ``#`` replaced by ``_``."""
    return "".join("_" if char in ",!~#" else char for char in string)
32
+
33
+
34
def getQueryRefCombinations(filename):
    """Collect the query-side interval and header of every alignment, per query.

    The delta file may be gzip-compressed or plain text; its first two lines
    are skipped. Each later line starting with ``>`` is a header whose second
    whitespace-separated field is the query name (passed through ``scrub``).
    Any other line with more than four fields is an alignment whose third and
    fourth fields are the query coordinates.

    Args:
        filename: Path to the delta file.

    Returns:
        ``(header_lines_by_query, lines_by_query)``, two dicts keyed by
        scrubbed query name. ``lines_by_query[q]`` lists ``(query_min,
        query_max)`` tuples in file order and ``header_lines_by_query[q]`` is
        the parallel list of the header line each alignment appeared under.
    """
    f = gzip.open(filename, 'rt')
    try:
        f.readline()
    except (OSError, EOFError):
        # Not a readable gzip stream: release that handle (previously leaked)
        # and read the file as plain text instead.
        f.close()
        f = open(filename, 'r')
        f.readline()

    linecounter = 0
    current_query_name = ""
    current_header = ""
    lines_by_query = {}
    header_lines_by_query = {}

    before = time.time()

    with f:
        # Ignore the first two lines (the first was consumed above)
        f.readline()

        for line in f:
            if line[0] == ">":
                linecounter += 1
                current_header = line.strip()
                current_query_name = scrub(current_header.split()[1])
                lines_by_query.setdefault(current_query_name, [])
                header_lines_by_query.setdefault(current_query_name, [])
            else:
                fields = line.strip().split()
                if len(fields) > 4:
                    # sometimes start and end are the other way around, but for this they need to be in order
                    query_start = int(fields[2])
                    query_end = int(fields[3])
                    lines_by_query[current_query_name].append(
                        (min(query_start, query_end), max(query_start, query_end))
                    )
                    header_lines_by_query[current_query_name].append(current_header)

    print("First read through the file: %d seconds for %d query-reference combinations" % (time.time()-before,linecounter))

    return (header_lines_by_query, lines_by_query)
78
+
79
def calculateUniqueness(header_lines_by_query, lines_by_query, unique_length, keep_small_uniques):
    """Decide, for every query, which of its alignments to keep.

    Args:
        header_lines_by_query: Accepted for interface compatibility; not used here.
        lines_by_query: Dict mapping query name to its list of
            ``(query_min, query_max)`` intervals.
        unique_length: Passed to ``summarize_planesweep`` as
            ``unique_length_required``.
        keep_small_uniques: Passed through to ``summarize_planesweep``.

    Returns:
        Dict mapping each query name to the list returned by
        ``summarize_planesweep`` for that query's intervals.
    """
    before = time.time()
    unique_alignments = {}
    num_queries = len(lines_by_query)
    print("Filtering alignments of %d queries" % (num_queries))

    # Integer step between progress reports. True division here produced a
    # fractional step under Python 3, so `counter % step == 0` fired
    # erratically (or hardly ever) instead of roughly every 1% / 10%.
    if num_queries < 10:
        num_query_step_to_report = 1
    elif num_queries < 100:
        num_query_step_to_report = num_queries // 10
    else:
        num_query_step_to_report = num_queries // 100

    for query_counter, query in enumerate(lines_by_query, start=1):
        unique_alignments[query] = summarize_planesweep(
            lines_by_query[query],
            unique_length_required=unique_length,
            keep_small_uniques=keep_small_uniques,
        )
        if query_counter % num_query_step_to_report == 0:
            print("Progress: %d%%" % (query_counter * 100 // num_queries))

    print("Progress: 100%")

    print("Deciding which alignments to keep: %d seconds for %d queries" % (time.time()-before,num_queries))

    return unique_alignments
104
+
105
+
106
def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
    """Return the indices of the alignments in ``lines`` that should be kept.

    ``lines`` is a list of ``(query_min, query_max)`` intervals. A sweep over
    the sorted interval endpoints records every stretch that is covered by
    exactly one interval. For each alignment, the stretches starting at the
    one located by ``binary_search`` are summed for as long as they lie
    within the alignment; the alignment is kept when that sum reaches
    ``unique_length_required``, or when ``keep_small_uniques`` is ``True``
    and the located stretch coincides exactly with the alignment.

    With a single alignment it is kept if ``keep_small_uniques`` is ``True``
    or its length reaches ``unique_length_required``. No alignments gives
    ``[]``.
    """
    total = len(lines)
    if total == 0:
        return []
    if total == 1:
        span = abs(lines[0][1] - lines[0][0])
        if keep_small_uniques == True or span >= unique_length_required:
            return [0]
        return []

    # Endpoint events, ordered by position (stable, so ties keep input order).
    events = []
    for low, high in lines:
        events += [(low, "start"), (high, "stop")]
    events.sort(key=operator.itemgetter(0))

    # Sweep: whenever exactly one alignment is open, the stretch since the
    # previous event is covered uniquely.
    coverage = 0
    previous = -1
    uniq_left = []
    uniq_right = []
    for position, kind in events:
        if coverage == 1:
            uniq_left.append(previous)
            uniq_right.append(position)
        coverage += 1 if kind == "start" else -1
        previous = position

    kept = []
    stretch_count = len(uniq_left)
    for index, (low, high) in enumerate(lines):
        i = binary_search(low, uniq_left, 0, stretch_count)
        exact_match = uniq_left[i] == low and uniq_right[i] == high

        covered = 0
        while i < stretch_count and uniq_left[i] >= low and uniq_right[i] <= high:
            covered += uniq_right[i] - uniq_left[i]
            i += 1

        if covered >= unique_length_required or (keep_small_uniques == True and exact_match):
            kept.append(index)

    return kept
166
+
167
+
168
+
169
def binary_search(query, numbers, left, right):
    """Search the sorted slice ``numbers[left:right]`` for ``query``.

    Returns the index of an element equal to ``query`` if the bisection hits
    one, otherwise the index of the first element to the right of where
    ``query`` would go (which may equal ``right``).
    """
    low, high = left, right
    while low < high:
        middle = int((high + low) / 2)
        candidate = numbers[middle]
        if query == candidate:
            return middle
        if query < candidate:
            high = middle
        else:
            low = middle + 1
    return high
183
+
184
+
185
def natural_key(string_):
    """Split ``string_`` into text and integer chunks for natural sorting.

    Runs of digits become ``int`` values and everything else stays a string,
    so that e.g. ``chr2`` sorts before ``chr10``.
    See http://www.codinghorror.com/blog/archives/001018.html
    """
    key = []
    for chunk in re.split(r'(\d+)', string_):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk)
    return key
188
+
189
def writeFilteredDeltaFile(filename, output_filename, unique_alignments, unique_length, header_lines_by_query, write_delta=True):
    """Re-read the delta file, tag every alignment, and optionally write a filtered delta.

    Each alignment is tagged ``"unique"`` when its running index within its
    query is listed in ``unique_alignments[query]`` and ``"repetitive"``
    otherwise. When ``write_delta`` is true, the two leading lines, the needed
    headers, the unique alignments and the lines that follow them are copied
    to ``<output_filename>.uniqueAnchorFiltered_l<unique_length>.delta.gz``.

    Args:
        filename: Path to the input delta file (gzip-compressed or plain text).
        output_filename: Prefix for the filtered delta output.
        unique_alignments: Dict mapping scrubbed query name to the indices of
            the alignments to keep.
        unique_length: Only used to build the output file name.
        header_lines_by_query: Dict mapping query name to the header line of
            each of its alignments, indexed like ``unique_alignments``.
        write_delta: Whether to write the filtered delta file at all.

    Returns:
        ``(reference_lengths, fields_by_query)``. ``reference_lengths`` is a
        list of ``(reference_name, size)`` in order of first appearance.
        ``fields_by_query`` maps query name to a list of ``[ref_start,
        ref_end, query_start, query_end, ref_size, query_size, ref_name,
        query_name, tag]`` records.
    """
    f = gzip.open(filename, 'rt')
    try:
        header1 = f.readline()
    except (OSError, EOFError):
        # Not a readable gzip stream: release that handle (previously leaked)
        # and read the file as plain text instead.
        f.close()
        f = open(filename, 'r')
        header1 = f.readline()

    f_out_delta = None
    try:
        if write_delta:
            f_out_delta = gzip.open(output_filename + ".uniqueAnchorFiltered_l%d.delta.gz" % (unique_length),'wt')
            f_out_delta.write(header1)
            f_out_delta.write(f.readline())
        else:
            f.readline()

        # For filtered delta file:
        list_of_unique_alignments = []
        unique_indexes = set()  # same content as the list, for O(1) membership tests
        alignment_counter = {}
        keep_printing = False

        # For coords:
        current_query_name = ""

        # For basic assembly stats:
        ref_sequences = set()
        reference_lengths = []
        fields_by_query = {}

        for line in f:
            if line[0] == ">":
                fields = line.strip().split()

                query = scrub(fields[1])
                list_of_unique_alignments = unique_alignments[query]
                unique_indexes = set(list_of_unique_alignments)

                if write_delta:
                    # Copy the header only if a kept alignment appeared under it.
                    stripped_header = line.strip()
                    headers_for_query = header_lines_by_query[query]
                    if any(stripped_header == headers_for_query[index] for index in list_of_unique_alignments):
                        f_out_delta.write(line)
                # The counter runs across all headers of the same query.
                alignment_counter.setdefault(query, 0)

                current_reference_name = scrub(fields[0][1:])
                current_query_name = scrub(fields[1])
                current_reference_size = int(fields[2])
                current_query_size = int(fields[3])

                if current_reference_name not in ref_sequences:
                    reference_lengths.append((current_reference_name, current_reference_size))
                    ref_sequences.add(current_reference_name)

            else:
                fields = line.strip().split()
                if len(fields) > 4:
                    ref_start = int(fields[0])
                    ref_end = int(fields[1])
                    query_start = int(fields[2])
                    query_end = int(fields[3])
                    csv_tag = "repetitive"
                    if alignment_counter[query] in unique_indexes:
                        if write_delta:
                            f_out_delta.write(line)
                        csv_tag = "unique"
                        keep_printing = True
                    else:
                        keep_printing = False
                    fields = [ref_start, ref_end, query_start, query_end, current_reference_size, current_query_size, current_reference_name, current_query_name, csv_tag]
                    fields_by_query.setdefault(current_query_name, []).append(fields)
                    alignment_counter[query] = alignment_counter[query] + 1

                elif keep_printing and write_delta:
                    # Lines following a kept alignment belong to it and are copied too.
                    f_out_delta.write(line)
    finally:
        # Close both handles even when parsing fails part-way through.
        f.close()
        if f_out_delta is not None:
            f_out_delta.close()

    return reference_lengths, fields_by_query
283
+
284
def index_for_dot(reference_lengths, fields_by_query, output_prefix, max_overview_alignments):
    """Write the ``<output_prefix>.coords`` and ``<output_prefix>.coords.idx`` files.

    The ``.coords`` file holds, per query, a ``!query!unique`` section and a
    ``!query!repetitive`` section of alignment rows. The ``.coords.idx`` file
    has three parts: ``#ref`` (references with their matching queries),
    ``#query`` (per query: length, orientation, the offset of its unique
    section in ``.coords``, the sizes of its unique and repetitive sections,
    and its matching references) and ``#overview`` (the longest alignments,
    at most ``max_overview_alignments`` of them).

    Args:
        reference_lengths: List of ``(reference_name, length)``; sorted in
            place by natural order of the name.
        fields_by_query: Dict mapping query name to a list of records
            ``[ref_start, ref_end, query_start, query_end, ref_size,
            query_size, ref_name, query_name, tag]``. The query coordinates of
            a record are rewritten in place when its query is flipped.
        output_prefix: Path prefix for both output files.
        max_overview_alignments: Upper bound on the rows in ``#overview``.
    """
    # Find the order of the reference chromosomes
    reference_lengths.sort(key=lambda x: natural_key(x[0]))

    # Find the cumulative sums
    cumulative_sum = 0
    ref_chrom_offsets = {}
    queries_by_reference = {}
    for ref, ref_length in reference_lengths:
        ref_chrom_offsets[ref] = cumulative_sum
        cumulative_sum += ref_length
        queries_by_reference[ref] = set()

    # Calculate relative positions of each alignment in this cumulative length, and take the median of these for each query, then sort the queries by those scores
    flip_by_query = {}
    unique_references_by_query = {}  # for index, only unique alignments
    all_references_by_query = {}  # for index, including repetitive alignments
    relative_ref_position_by_query = []  # for ordering

    ordered_tags = ["unique", "repetitive"]

    query_byte_positions = {}
    query_lengths = {}
    all_alignments = []

    # The context manager closes (and flushes) the .coords file, which was
    # previously left open for the garbage collector to deal with.
    with open(output_prefix + ".coords", 'w') as f_out_coords:
        f_out_coords.write("ref_start,ref_end,query_start,query_end,ref\n")

        for query_name in fields_by_query:
            lines = fields_by_query[query_name]
            sum_forward = 0
            sum_reverse = 0
            ref_position_scores = []
            unique_references_by_query[query_name] = set()
            all_references_by_query[query_name] = set()

            for fields in lines:
                tag = fields[8]

                query_name = fields[7]
                query_lengths[query_name] = int(fields[5])

                # Read this alignment's own reference before recording it.
                # It used to be assigned only inside the "unique" branch
                # below, so this line picked up a stale value left over from
                # an earlier loop (or failed when nothing had set it yet).
                ref = fields[6]
                all_references_by_query[query_name].add(ref)

                # Only use unique alignments to decide contig orientation
                if tag == "unique":
                    query_start = int(fields[2])
                    query_stop = int(fields[3])
                    ref_start = int(fields[0])
                    ref_stop = int(fields[1])
                    alignment_length = abs(query_stop - query_start)

                    # for index:
                    unique_references_by_query[query_name].add(ref)
                    queries_by_reference[ref].add(query_name)

                    # for ordering:
                    ref_position_scores.append(ref_chrom_offsets[ref] + (ref_start+ref_stop)/2)

                    # for orientation:
                    if query_stop < query_start:
                        sum_reverse += alignment_length
                    else:
                        sum_forward += alignment_length

            # orientation:
            flip = sum_reverse > sum_forward
            flip_by_query[query_name] = "-" if flip else "+"

            for tag in ordered_tags:
                query_byte_positions[(query_name, tag)] = f_out_coords.tell()
                f_out_coords.write("!" + query_name + "!" + tag + "\n")

                for fields in lines:
                    if fields[8] == tag:
                        if flip:
                            fields[2] = int(fields[5]) - int(fields[2])
                            fields[3] = int(fields[5]) - int(fields[3])

                        output_fields = [fields[0], fields[1], fields[2], fields[3], fields[6]]
                        f_out_coords.write(",".join([str(i) for i in output_fields]) + "\n")

                        # For alignment overview:
                        alignment_length = abs(int(fields[3])-int(fields[2]))
                        all_alignments.append(([fields[0], fields[1], fields[2], fields[3], fields[6], fields[7], fields[8]], alignment_length))

            # Record where this query's data ends, once both sections are
            # written. The old bookkeeping updated the previous query's end
            # on every tag iteration, so it finally pointed at the current
            # query's repetitive section and the previous query's repetitive
            # size wrongly included the current query's unique section.
            query_byte_positions[(query_name, "end")] = f_out_coords.tell()

            # ordering
            if len(ref_position_scores) > 0:
                relative_ref_position_by_query.append((query_name,np.median(ref_position_scores)))
            else:
                relative_ref_position_by_query.append((query_name,0))

    relative_ref_position_by_query.sort(key=lambda x: x[1])

    with open(output_prefix + ".coords.idx", 'w') as f_out_index:
        f_out_index.write("#ref\n")
        f_out_index.write("ref,ref_length,matching_queries\n")
        # reference_lengths is sorted by the reference chromosome name
        for ref, ref_length in reference_lengths:
            f_out_index.write("%s,%d,%s\n" % (ref,ref_length,"~".join(queries_by_reference[ref])))

        f_out_index.write("#query\n")
        f_out_index.write("query,query_length,orientation,bytePosition_unique,bytePosition_repetitive,bytePosition_end,unique_matching_refs,matching_refs\n")
        # relative_ref_position_by_query is sorted by rel_pos
        for query, rel_pos in relative_ref_position_by_query:
            unique_start = query_byte_positions[(query,"unique")]
            repetitive_start = query_byte_positions[(query,"repetitive")]
            section_end = query_byte_positions[(query,"end")]
            f_out_index.write("%s,%d,%s,%d,%d,%d,%s,%s\n" % (
                query,
                query_lengths[query],
                flip_by_query[query],
                unique_start,
                repetitive_start - unique_start,
                section_end - repetitive_start,
                "~".join(unique_references_by_query[query]),
                "~".join(all_references_by_query[query]),
            ))

        f_out_index.write("#overview\n")
        f_out_index.write("ref_start,ref_end,query_start,query_end,ref,query,tag\n")

        num_overview_alignments = min(max_overview_alignments,len(all_alignments))
        if num_overview_alignments < len(all_alignments):
            print("Included the longest " + str(max_overview_alignments) + " alignments in the index under #overview (change this with the --overview parameter), out of a total of " + str(len(all_alignments)) + " alignments.")

        # Longest alignments first
        all_alignments.sort(key=lambda x: -x[1])
        for tup in all_alignments[0:num_overview_alignments]:
            f_out_index.write(",".join([str(i) for i in tup[0]]) + "\n")
418
+
419
def main():
    """Parse the command-line options and run the Dot preparation with them."""
    parser = argparse.ArgumentParser(description="Take a delta file, apply Assemblytics unique anchor filtering, and prepare coordinates input files for Dot")
    # (flag, add_argument keyword settings), in the order they are registered.
    option_specs = (
        ("--delta", dict(help="delta file", dest="delta", type=str, required=True)),
        ("--out", dict(help="output file", dest="out", type=str, default="output")),
        ("--unique-length", dict(help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000", dest="unique_length", type=int, default=10000)),
        ("--overview", dict(help="The number of alignments to include in the coords.idx output file, which will be shown in the overview for Dot. Default: 1000", dest="overview", type=int, default=1000)),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    parser.set_defaults(func=run)
    parsed = parser.parse_args()
    parsed.func(parsed)
428
+
429
# Run the command-line entry point when this module is executed as a script.
if __name__=="__main__":
    main()