assemblytics 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import gzip
5
+ import os
6
+ import time
7
+ import numpy as np
8
+ import operator
9
+
10
+
11
def _open_delta(filename):
    """Open a delta file for text reading, gzip-compressed or plain.

    The gzip magic number (bytes 0x1f 0x8b) decides which opener is used, so
    no exception has to be swallowed to detect the format and no half-opened
    gzip handle is left behind for plain files.
    """
    with open(filename, "rb") as probe:
        is_gzipped = probe.read(2) == b"\x1f\x8b"
    if is_gzipped:
        return gzip.open(filename, "rt")
    return open(filename, "r")


def _write_length_stats(out, label, source_name, lengths):
    """Write one block of sequence-length statistics to *out*.

    *lengths* must be sorted in increasing order, as ``N50`` requires.
    """
    out.write("%s: %s\n" % (label, source_name))
    out.write("Number of sequences: %s\n" % intWithCommas(len(lengths)))
    out.write("Total sequence length: %s\n" % gig_meg(sum(lengths)))
    out.write("Mean: %s\n" % gig_meg(np.mean(lengths)))
    out.write("Min: %s\n" % gig_meg(np.min(lengths)))
    out.write("Max: %s\n" % gig_meg(np.max(lengths)))
    out.write("N50: %s\n" % gig_meg(N50(lengths)))


def run(args):
    """Filter a delta file down to alignments with enough unique query anchor.

    Reads ``args.delta`` (plain or gzipped) twice: the first pass collects the
    query interval of every alignment, the second pass writes the outputs once
    ``summarize_planesweep`` has decided which alignments to keep.

    Args:
        args: Namespace with ``delta`` (input path), ``unique_length``
            (required unique query bases per alignment), ``out`` (output
            directory, created if missing) and ``keep_small_uniques``.

    Files written into ``args.out``:
        * ``assemblytics_unique_length_filtered_l<N>.delta.gz`` - kept alignments
        * ``assemblytics_coords.tab`` - coordinates of kept alignments
        * ``assemblytics_coords.csv`` - all alignments, tagged unique/repetitive
        * ``assemblytics_ref.genome`` / ``assemblytics_query.genome`` - lengths
          of sequences with at least one kept alignment, longest first
        * ``assemblytics_assembly_stats.txt`` - length statistics of every
          reference and query sequence named in the delta file
    """
    filename = args.delta
    unique_length = args.unique_length
    output_dir = args.out
    keep_small_uniques = args.keep_small_uniques
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # ---- Pass 1: collect (query_min, query_max) for every alignment ----
    lines_by_query = {}
    header_lines_by_query = {}
    with _open_delta(filename) as f:
        # The two file-header lines carry no alignments.
        f.readline()
        f.readline()

        current_query_name = ""
        current_header = ""
        for line in f:
            if line[0] == ">":
                current_header = line.strip()
                current_query_name = current_header.split()[1]
                if current_query_name not in lines_by_query:
                    lines_by_query[current_query_name] = []
                    header_lines_by_query[current_query_name] = []
            else:
                fields = line.strip().split()
                if len(fields) > 4:
                    # Start and end may be reversed; the sweep needs them ordered.
                    first, second = int(fields[2]), int(fields[3])
                    lines_by_query[current_query_name].append(
                        (min(first, second), max(first, second)))
                    header_lines_by_query[current_query_name].append(current_header)

    # ---- Decide which alignments to keep, per query ----
    # Indices are stored as sets (O(1) membership in pass 2), and the headers
    # owning at least one kept alignment are precomputed per query.
    alignments_to_keep = {}
    kept_headers = {}
    for query, intervals in lines_by_query.items():
        kept = summarize_planesweep(
            intervals,
            unique_length_required=unique_length,
            keep_small_uniques=keep_small_uniques,
        )
        alignments_to_keep[query] = set(kept)
        kept_headers[query] = {header_lines_by_query[query][index] for index in kept}

    before = time.time()

    # ---- Pass 2: write filtered delta and coordinate tables ----
    linecounter = 0
    alignment_counter = {}   # query -> index of its next alignment
    keep_printing = False    # whether trailing delta lines belong to a kept alignment
    kept_indices = set()

    # For basic assembly stats (every sequence named in a header):
    ref_sequences = set()
    query_sequences = set()
    ref_lengths = []
    query_lengths = []

    # For genome length files (only sequences with at least one unique
    # alignment, matching what ends up in coords.tab):
    unique_ref_entries = {}
    unique_query_entries = {}

    delta_out_path = os.path.join(
        output_dir, "assemblytics_unique_length_filtered_l%d.delta.gz" % (unique_length))
    with _open_delta(filename) as f, \
            gzip.open(delta_out_path, "wt") as fout, \
            open(os.path.join(output_dir, "assemblytics_coords.tab"), "w") as fcoords_out_tab, \
            open(os.path.join(output_dir, "assemblytics_coords.csv"), "w") as fcoords_out_csv:
        header1 = f.readline()
        fout.write(header1)
        fout.write(f.readline())
        fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")

        for line in f:
            linecounter += 1
            if line[0] == ">":
                stripped = line.strip()
                fields = stripped.split()

                # For delta file output:
                query = fields[1]
                kept_indices = alignments_to_keep[query]
                if stripped in kept_headers[query]:
                    # At least one alignment under this header is kept.
                    fout.write(line)
                alignment_counter.setdefault(query, 0)

                # For coords:
                current_reference_name = fields[0][1:]
                current_query_name = fields[1]
                current_reference_size = int(fields[2])
                current_query_size = int(fields[3])

                # For basic assembly stats:
                if current_reference_name not in ref_sequences:
                    ref_lengths.append(current_reference_size)
                    ref_sequences.add(current_reference_name)
                if current_query_name not in query_sequences:
                    query_lengths.append(current_query_size)
                    query_sequences.add(current_query_name)
            else:
                fields = line.strip().split()
                if len(fields) > 4:
                    ref_start = int(fields[0])
                    ref_end = int(fields[1])
                    query_start = int(fields[2])
                    query_end = int(fields[3])

                    keep_printing = alignment_counter[query] in kept_indices
                    if keep_printing:
                        fout.write(line)
                        fcoords_out_tab.write("\t".join(map(str, [
                            ref_start, ref_end, query_start, query_end,
                            current_reference_size, current_query_size,
                            current_reference_name, current_query_name,
                        ])) + "\n")
                        unique_ref_entries[current_reference_name] = current_reference_size
                        unique_query_entries[current_query_name] = current_query_size
                        csv_tag = "unique"
                    else:
                        csv_tag = "repetitive"

                    # Commas in names would break the CSV, so they are replaced.
                    fcoords_out_csv.write(",".join(map(str, [
                        ref_start, ref_end, query_start, query_end,
                        current_reference_size, current_query_size,
                        current_reference_name.replace(",", "_"),
                        current_query_name.replace(",", "_"),
                        csv_tag,
                    ])) + "\n")
                    alignment_counter[query] += 1
                elif keep_printing:
                    # Delta-encoding lines following a kept alignment line.
                    fout.write(line)

    with open(os.path.join(output_dir, "assemblytics_ref.genome"), "w") as ref_genome_out:
        for name, length in sorted(unique_ref_entries.items(), key=lambda item: item[1], reverse=True):
            ref_genome_out.write("%s\t%d\n" % (name, length))

    with open(os.path.join(output_dir, "assemblytics_query.genome"), "w") as query_genome_out:
        for name, length in sorted(unique_query_entries.items(), key=lambda item: item[1], reverse=True):
            query_genome_out.write("%s\t%d\n" % (name, length))

    print("Reading file and recording all the entries we decided to keep: %d seconds for %d total lines in file" % (time.time()-before,linecounter))

    # ---- Assembly statistics ----
    ref_lengths = np.array(sorted(ref_lengths))
    query_lengths = np.array(sorted(query_lengths))

    # The first header line holds the reference and query file paths.
    header_fields = header1.split()
    with open(os.path.join(output_dir, "assemblytics_assembly_stats.txt"), "w") as f_stats_out:
        _write_length_stats(f_stats_out, "Reference", header_fields[0].split("/")[-1], ref_lengths)
        f_stats_out.write("\n\n")
        _write_length_stats(f_stats_out, "Query", header_fields[1].split("/")[-1], query_lengths)
229
+
230
def N50(sorted_list):
    """Return the N50 of a collection of sequence lengths.

    Walking from the largest length downwards, the N50 is the first length at
    which the running total reaches at least half of the overall total.

    Args:
        sorted_list: Lengths sorted in increasing order (list or numpy array).

    Returns:
        The N50 length, or ``None`` when *sorted_list* is empty.
    """
    # Computed once; recomputing it per iteration made the loop quadratic.
    half_total = sum(sorted_list) / 2

    cumsum = 0
    # Reverse so that iteration starts with the largest element.
    for length in sorted_list[::-1]:
        cumsum += length
        if cumsum >= half_total:
            return length
    return None
239
+
240
+
241
def gig_meg(number, digits=2):
    """Format a base-pair count with the largest fitting unit.

    Values of at least 1e9, 1e6 or 1e3 are scaled to Gbp, Mbp or Kbp and
    rounded to *digits* decimals; smaller values are reported in bp.

    Args:
        number: Length in base pairs (int or float).
        digits: Number of decimals kept when rounding.

    Returns:
        A string such as ``"1.5 Kbp"`` or ``"999 bp"``.
    """
    gig = 1000000000.
    meg = 1000000.
    kil = 1000.

    # ">=" so that an exact boundary uses the larger unit
    # (1000000 -> "1.0 Mbp" rather than "1000.0 Kbp").
    if number >= gig:
        return str(round(number / gig, digits)) + " Gbp"
    if number >= meg:
        return str(round(number / meg, digits)) + " Mbp"
    if number >= kil:
        return str(round(number / kil, digits)) + " Kbp"

    # Fractional values (e.g. a mean) honour *digits* too; integers are
    # printed unchanged.
    if isinstance(number, float):
        number = round(number, digits)
    return str(number) + " bp"
254
+
255
def intWithCommas(x):
    """Return integer *x* as a string with comma thousands separators.

    Args:
        x: An integer; ``bool`` is rejected, as it was before.

    Returns:
        For example ``"1,234,567"`` or ``"-1,234"``.

    Raises:
        TypeError: If *x* is not an integer.
    """
    if isinstance(x, bool) or not isinstance(x, int):
        raise TypeError("Parameter must be an integer.")
    # The "," format specifier performs the grouping, sign included.
    return "{:,}".format(int(x))
265
+
266
+
267
def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
    """Select the alignments of one query that are anchored by unique sequence.

    A sweep over all interval endpoints finds the stretches of the query that
    are covered by exactly one alignment. An alignment is kept when the unique
    stretches inside it add up to at least *unique_length_required*.

    Args:
        lines: List of ``(query_min, query_max)`` tuples, one per alignment,
            with ``query_min <= query_max``.
        unique_length_required: Minimum total unique length for an alignment
            to be kept.
        keep_small_uniques: When true, also keep an alignment below the
            threshold if its whole span is one unique stretch (or, for a
            single alignment, unconditionally).

    Returns:
        Indices into *lines* of the alignments to keep, in increasing order.
    """
    # Function-local import so the block does not depend on a file-level one.
    from bisect import bisect_left

    if not lines:
        return []

    # A lone alignment cannot overlap anything: only its length matters.
    if len(lines) == 1:
        query_min, query_max = lines[0]
        if keep_small_uniques or abs(query_max - query_min) >= unique_length_required:
            return [0]
        return []

    events = []
    for query_min, query_max in lines:
        events.append((query_min, "start"))
        events.append((query_max, "stop"))
    # Stable sort on position only: ties keep their insertion order.
    events.sort(key=operator.itemgetter(0))

    # Sweep: whenever coverage was exactly 1 since the previous event, the
    # stretch between the two events is unique. Both lists end up sorted.
    current_coverage = 0
    last_position = -1
    unique_lefts = []
    unique_rights = []
    for pos, change in events:
        if current_coverage == 1:
            unique_lefts.append(last_position)
            unique_rights.append(pos)
        current_coverage += 1 if change == "start" else -1
        last_position = pos

    num_intervals = len(unique_lefts)
    alignments_to_keep = []
    for index, (query_min, query_max) in enumerate(lines):
        # First unique stretch starting at or after the alignment start.
        i = bisect_left(unique_lefts, query_min)

        sum_uniq = 0
        exact_match = False
        # From here on every left edge is >= query_min, so only the right
        # edge needs checking.
        while i < num_intervals and unique_rights[i] <= query_max:
            # Checked for every stretch in range rather than only the first:
            # zero-length stretches can share the alignment's left edge and
            # would otherwise hide the exact match.
            if unique_lefts[i] == query_min and unique_rights[i] == query_max:
                exact_match = True
            sum_uniq += unique_rights[i] - unique_lefts[i]
            i += 1

        if sum_uniq >= unique_length_required or (keep_small_uniques and exact_match):
            alignments_to_keep.append(index)

    return alignments_to_keep
327
+
328
+
329
def binary_search(query, numbers, left, right):
    """Binary-search the sorted sequence *numbers* within ``[left, right)``.

    Args:
        query: Value to look for.
        numbers: Sequence sorted in increasing order.
        left: First index of the search window (inclusive).
        right: End index of the search window (exclusive).

    Returns:
        The index of an element equal to *query*, or, when there is none, the
        index of the first element greater than *query*. This equals *right*
        when every element in the window is smaller, so the result may be one
        past the last valid index.
    """
    low, high = left, right
    while low < high:
        # Integer floor division avoids a round trip through float.
        mid = (low + high) // 2
        if query == numbers[mid]:
            return mid
        if query < numbers[mid]:
            high = mid
        else:
            low = mid + 1
    return high
343
+
344
+
345
+
346
def main(argv=None):
    """Parse command-line options and run the unique-anchor filter.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so existing callers are unaffected.
    """
    parser = argparse.ArgumentParser(
        description="Filters alignments in delta file based on whether each alignment has a unique sequence anchoring it")
    parser.add_argument(
        "--delta", help="delta file", dest="delta", type=str, required=True)
    parser.add_argument(
        "--out",
        help="output directory for assemblytics_* files (default: current directory)",
        dest="out", type=str, default=".")
    parser.add_argument(
        "--unique-length",
        help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000",
        dest="unique_length", type=int, default=10000)
    parser.add_argument(
        "--keep-small-uniques",
        help="Keep small alignments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places",
        dest="keep_small_uniques", action="store_true")
    parser.set_defaults(func=run)
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import sys
4
+ import pandas as pd
5
+ import matplotlib
6
+ matplotlib.use('Agg')
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import os
10
+ import warnings
11
+ warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
12
+
13
+ def comma_format(num):
14
+ return "{:,}".format(int(abs(num)))
15
+
16
+ def run(output_dir, abs_min_var, abs_max_var):
17
+ filename = os.path.join(output_dir, "assemblytics_structural_variants.bed")
18
+ if not os.path.exists(filename):
19
+ print(f"File {filename} not found.")
20
+ return
21
+
22
+ try:
23
+ bed = pd.read_csv(filename, sep="\t")
24
+ except Exception as e:
25
+ print(f"Error reading {filename}: {e}")
26
+ return
27
+
28
+ if bed.empty:
29
+ print("No variants found in BED file.")
30
+ return
31
+
32
+ # Rename columns to match R script expectations
33
+ expected_cols = ["chrom", "start", "stop", "name", "size", "strand", "type", "ref_dist", "query_dist", "contig_position", "method_found"]
34
+ bed.columns = expected_cols[:len(bed.columns)]
35
+
36
+ # Revalue types
37
+ type_map = {
38
+ "Repeat_expansion": "Repeat expansion",
39
+ "Repeat_contraction": "Repeat contraction",
40
+ "Tandem_expansion": "Tandem expansion",
41
+ "Tandem_contraction": "Tandem contraction"
42
+ }
43
+ bed['type'] = bed['type'].replace(type_map)
44
+
45
+ types_allowed = ["Insertion", "Deletion", "Repeat expansion", "Repeat contraction", "Tandem expansion", "Tandem contraction"]
46
+
47
+ # Filter for allowed types and set as categorical for consistent ordering
48
+ bed = bed[bed['type'].isin(types_allowed)]
49
+ bed['type'] = pd.Categorical(bed['type'], categories=types_allowed, ordered=True)
50
+
51
+ # Color palette (Set1 from RColorBrewer: [1,2,3,4,5,7,8])
52
+ # Set1 hex colors: #E41A1C, #377EB8, #4DAF4A, #984EA3, #FF7F00, #A65628
53
+ # R big_palette<-brewer.pal(9,"Set1")[c(1,2,3,4,5,7)] was actually using 7th which is pink.
54
+ # User said Set1[8] in python instead of 7 for brown.
55
+ # Set1 colors: 1:red, 2:blue, 3:green, 4:purple, 5:orange, 6:yellow, 7:brown, 8:pink, 9:grey
56
+ # Actually brewer.pal(9, "Set1") is:
57
+ # 1: #E41A1C (red)
58
+ # 2: #377EB8 (blue)
59
+ # 3: #4DAF4A (green)
60
+ # 4: #984EA3 (purple)
61
+ # 5: #FF7F00 (orange)
62
+ # 6: #FFFF33 (yellow)
63
+ # 7: #A65628 (brown)
64
+ # 8: #F781BF (pink)
65
+ # 9: #999999 (grey)
66
+ # The user says Set1[8] for brown. In R indexing starts at 1.
67
+ # Wait, R brewer.pal(9, "Set1")[7] is brown (#A65628).
68
+ # If the user says R is 1-indexed and they want Set1[8] in python... maybe they meant the 8th color in Set1 is brown?
69
+ # Actually in Set1, 7 is brown and 8 is pink.
70
+ # If the previous code used pink (#F781BF) and the user wants brown, brown is #A65628.
71
+ big_palette = ["#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#A65628"]
72
+
73
+ # Prep data for log-scaled plot
74
+ alt = bed.copy()
75
+ contraction_types = ["Deletion", "Repeat contraction", "Tandem contraction"]
76
+ alt.loc[alt['type'].isin(contraction_types), 'size'] = -1 * alt.loc[alt['type'].isin(contraction_types), 'size']
77
+
78
+ alt['Type'] = "None"
79
+ alt.loc[alt['type'].isin(["Insertion", "Deletion"]), 'Type'] = "Indel"
80
+ alt.loc[alt['type'].isin(["Tandem expansion", "Tandem contraction"]), 'Type'] = "Tandem"
81
+ alt.loc[alt['type'].isin(["Repeat expansion", "Repeat contraction"]), 'Type'] = "Repeat"
82
+ # User requested order: Indel, Repeat, Tandem
83
+ alt['Type'] = pd.Categorical(alt['Type'], categories=["Indel", "Repeat", "Tandem"], ordered=True)
84
+
85
+ # Size cutoffs
86
+ var_size_cutoffs = sorted(list(set([abs_min_var, 10, 50, 500, abs_max_var])))
87
+ var_size_cutoffs = [x for x in var_size_cutoffs if x >= abs_min_var and x <= abs_max_var]
88
+
89
+ for i in range(len(var_size_cutoffs) - 1):
90
+ min_var = var_size_cutoffs[i]
91
+ max_var = var_size_cutoffs[i+1]
92
+
93
+ if min_var < abs_max_var and max_var > abs_min_var:
94
+ filtered_bed = bed[(bed['size'] >= min_var) & (bed['size'] <= max_var)]
95
+
96
+ if not filtered_bed.empty:
97
+ binwidth = max(1, (max_var - min_var) / 100)
98
+ bins = np.arange(min_var, max_var + binwidth, binwidth)
99
+
100
+ # Calculate global max for y-axis synchronization
101
+ max_counts = []
102
+ for t in types_allowed:
103
+ data = filtered_bed[filtered_bed['type'] == t]['size']
104
+ if not data.empty:
105
+ counts, _ = np.histogram(data, bins=bins)
106
+ max_counts.append(max(counts))
107
+ global_max = max(max_counts) if max_counts else 10
108
+
109
+ fig, axes = plt.subplots(nrows=len(types_allowed), ncols=1, figsize=(8, 10), sharex=True)
110
+ fig.suptitle(f"Variants {comma_format(min_var)} to {comma_format(max_var)} bp", fontsize=16)
111
+
112
+ for j, t in enumerate(types_allowed):
113
+ ax = axes[j]
114
+ data = filtered_bed[filtered_bed['type'] == t]['size']
115
+ ax.hist(data, bins=bins, color=big_palette[j], label=t)
116
+ ax.set_ylabel("Count", fontsize=8)
117
+ ax.tick_params(axis='both', which='major', labelsize=8)
118
+ ax.set_ylim(0, global_max * 1.1) # Add 10% padding
119
+
120
+ # Remove right and top spines
121
+ ax.spines['right'].set_visible(False)
122
+ ax.spines['top'].set_visible(False)
123
+
124
+ # Add type label inside the plot, moved up to avoid data
125
+ ax.text(0.98, 0.85, t, transform=ax.transAxes, horizontalalignment='right', verticalalignment='top', fontsize=10, fontweight='bold')
126
+
127
+ plt.xlabel("Variant size", fontsize=12)
128
+ plt.tight_layout(rect=[0, 0.03, 1, 0.95])
129
+
130
+ for fmt in ['png', 'pdf']:
131
+ plt.savefig(os.path.join(output_dir, f"assemblytics_size_distributions_{min_var}-{max_var}.{fmt}"), dpi=200)
132
+ plt.close()
133
+ else:
134
+ print(f"No variants in plot: min_var={min_var}, max_var={max_var}")
135
+
136
+ # Log-scaled plot
137
+ fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 8), sharex=True)
138
+ fig.suptitle(f"Variants {comma_format(abs_min_var)} to {comma_format(abs_max_var)} bp", fontsize=16)
139
+
140
+ # User requested order: Indel, Repeat, Tandem
141
+ categories_ordered = ["Indel", "Repeat", "Tandem"]
142
+ types_by_category = {
143
+ "Indel": ["Insertion", "Deletion"],
144
+ "Repeat": ["Repeat expansion", "Repeat contraction"],
145
+ "Tandem": ["Tandem expansion", "Tandem contraction"]
146
+ }
147
+
148
+ binwidth = (2 * abs_max_var) / 100
149
+ bins = np.arange(-abs_max_var, abs_max_var + binwidth, binwidth)
150
+
151
+ # Calculate global max for y-axis synchronization in log scale
152
+ max_counts_log = []
153
+ for category in categories_ordered:
154
+ cat_data = alt[alt['Type'] == category]
155
+ if not cat_data.empty:
156
+ # We want to show counts + 1 to make small counts visible on log scale
157
+ counts, _ = np.histogram(cat_data['size'], bins=bins)
158
+ max_counts_log.append(max(counts) + 1)
159
+ global_max_log = max(max_counts_log) if max_counts_log else 100
160
+
161
+ for i, category in enumerate(categories_ordered):
162
+ ax = axes[i]
163
+ for t in types_by_category[category]:
164
+ color_idx = types_allowed.index(t)
165
+ data = alt[alt['type'] == t]['size']
166
+ if not data.empty:
167
+ # Use np.histogram and plt.bar to manually implement count + 1 for log scale
168
+ counts, bin_edges = np.histogram(data, bins=bins)
169
+ # To match R's log(count + 1), we plot bars of height counts + 1
170
+ # But we need to handle the bottom of the log scale.
171
+ # Actually, a better way to match R exactly is to plot counts + 1 and set ylim bottom to 1.
172
+ bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
173
+ ax.bar(bin_centers, counts + 1, width=binwidth, color=big_palette[color_idx], label=t, alpha=0.7)
174
+
175
+ ax.set_yscale('log')
176
+ ax.set_ylabel("Log(count + 1)", fontsize=10)
177
+ ax.tick_params(axis='both', which='major', labelsize=8)
178
+ ax.spines['right'].set_visible(False)
179
+ ax.spines['top'].set_visible(False)
180
+ ax.set_ylim(1, global_max_log * 1.5)
181
+
182
+ # Add category label
183
+ ax.text(0.02, 0.85, category, transform=ax.transAxes, horizontalalignment='left', fontsize=12, fontweight='bold')
184
+
185
+ handles, labels = ax.get_legend_handles_labels()
186
+ if handles:
187
+ ax.legend(handles, labels, loc='upper right', fontsize=8)
188
+
189
+ plt.xlabel("Variant size", fontsize=12)
190
+ plt.tight_layout(rect=[0, 0.03, 1, 0.95])
191
+
192
+ for fmt in ['png', 'pdf']:
193
+ plt.savefig(os.path.join(output_dir, f"assemblytics_size_distributions_log.{fmt}"), dpi=200)
194
+ plt.close()
195
+
196
+ if __name__ == "__main__":
197
+ if len(sys.argv) < 4:
198
+ print("Usage: variant_charts.py output_dir abs_min_var abs_max_var")
199
+ sys.exit(1)
200
+
201
+ output_dir = sys.argv[1]
202
+ abs_min_var = int(sys.argv[2])
203
+ abs_max_var = int(sys.argv[3])
204
+ run(output_dir, abs_min_var, abs_max_var)