PyamilySeq 0.0.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,647 @@
+ #from line_profiler_pycharm import profile
+
+ from collections import OrderedDict,defaultdict
+ import copy
+ import math
+ import sys
+ import argparse
+ import os
+
+ try:
+     from .Constants import *
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
+     from Constants import *
+
+
+ def custom_sort_key(k, dict1, dict2):
+     return (len(dict1[k]), len(dict2[k]))
+
+ def sort_keys_by_values(dict1, dict2):
+     sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
+     return sorted_keys
+
+ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
+     print("Outputting gene_presence_absence file")
+     in_name = options.clusters.split('.')[0]
+     gpa_outfile = open(in_name+'_gene_presence_absence.csv','w')
+     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
+                       '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
+     gpa_outfile.write('","'.join(genome_dict.keys()))
+     gpa_outfile.write('"\n')
+     for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
+         average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
+         gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
+                           '","","","","","","","","",""')
+
+         full_out = ''
+         for genome in genome_dict.keys():
+             tmp_list = []
+             for value in sequences:
+                 if value.split('|')[0] == genome:
+                     tmp_list.append(value)
+             if tmp_list:
+                 full_out += ',"'+''.join(tmp_list)+'"'
+             else:
+                 full_out += ',""' # keep an empty column so genome columns stay aligned
+         gpa_outfile.write(full_out)
+         gpa_outfile.write('\n')
+
+     ### Below is some unfinished code
+     # edge_list_outfile = open(in_name+'_edge_list.csv','w')
+     # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
+     #     output = []
+     #     for entry in sequences:
+     #         # Split each entry at '|'
+     #         genome, gene = entry.split('|')
+     #         # Format the result as "gene genome"
+     #         output.append(f"{gene}\t{genome}")
+     #     for line in output:
+     #         edge_list_outfile.write(line + '\n')
+
+
+ def wrap_sequence(sequence, width=60):
+     wrapped_sequence = []
+     for i in range(0, len(sequence), width):
+         wrapped_sequence.append(sequence[i:i + width])
+     return "\n".join(wrapped_sequence)
+
+
+ def read_fasta(fasta_file):
+     sequences = {}
+     current_sequence = None
+     with open(fasta_file, 'r') as file:
+         for line in file:
+             line = line.strip()
+             if not line:
+                 continue # Skip empty lines
+             if line.startswith('>'):
+                 current_sequence = line[1:] # Remove '>' character
+                 sequences[current_sequence] = ''
+             else:
+                 sequences[current_sequence] += line
+     return sequences
+
+
+ def reorder_dict_by_keys(original_dict, sorted_keys):
+     return {k: original_dict[k] for k in sorted_keys}
+
+ def get_cores(options,genome_dict):
+     ##Calculate core groups
+     groups = OrderedDict()
+     cores = OrderedDict()
+     prev_top = len(genome_dict)
+     first = True
+     for group in options.core_groups.split(','):
+         calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
+         if first == False:
+             groups[group] = (calculated_floor,prev_top -1)
+         else:
+             groups[group] = (calculated_floor, prev_top)
+             first = False
+         prev_top = calculated_floor
+         first_core_group = 'first_core_' + group
+         cores[first_core_group] = []
+         if options.reclustered != None:
+             extended_core_group = 'extended_core_' + group
+             cores[extended_core_group] = []
+             combined_core_group = 'combined_core_' + group
+             cores[combined_core_group] = []
+             second_core_group = 'second_core_' + group
+             cores[second_core_group] = []
+             only_second_core_group = 'only_second_core_' + group
+             cores[only_second_core_group] = []
+     return cores, groups
+
+ #@profile
+ def calc_First_only_core(cluster, pep_num, groups, cores):
+     groups_as_list = list(groups.values())
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
+         res = idx
+     family_group = list(groups)[res]
+     cores['first_core_'+family_group].append(cluster)
+
+ #@profile
+ def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
+     groups_as_list = list(groups.values())
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
+         res = idx
+     family_group = list(groups)[res]
+     cores['extended_core_' + family_group].append(pep_num)
+
+
+ #@profile
+ def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count separately those gene families extended with StORF_Reporter but combined >1 PEP
+     groups_as_list = list(groups.values())
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
+         res = idx
+     family_group = list(groups)[res]
+     cores['combined_core_' + family_group].append(pep_num) # appended (rather than += 1) so the group can be counted with len(), like the other core lists
+
+
+ #@profile
+ def calc_Second_only_core(groups, cores, second_num):
+     groups_as_list = list(groups.values())
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
+         res = idx
+     family_group = list(groups)[res]
+     cores['second_core_' + family_group].append(second_num)
+
+ #@profile
+ def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
+     groups_as_list = list(groups.values())
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
+         res = idx
+     family_group = list(groups)[res]
+     cores['only_second_core_' + family_group].append(second_num)
+
+
+
+
+
+ #@profile
+ def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
+     num_clustered_First = defaultdict(list)
+     pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
+     list_of_reps = list(reps.keys())
+     for cluster, pep_genomes in pangenome_clusters_First.items():
+         rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
+         Com_PEP_Genomes = 0
+         Seconds = 0
+         seen_Seconds = []
+         added_Second_genomes = 0
+         try: # get the cluster from the storf clusters which contains this rep
+             clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
+             seen_clust_Genomes = []
+             num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
+             for clust in clustered_combined:
+                 if options.sequence_tag not in clust: # Not good enough at the moment
+                     clust_Genome = clust.split('|')[0]
+                     if clust_Genome not in seen_clust_Genomes:
+                         seen_clust_Genomes.append(clust_Genome)
+                     if clust_Genome not in pep_genomes:
+                         Com_PEP_Genomes += 1
+                     num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
+                 elif options.sequence_tag in clust:
+                     Seconds += 1
+                     clust_Genome = clust.split('|')[0]
+                     if clust_Genome not in seen_Seconds:
+                         seen_Seconds.append(clust_Genome)
+                     if clust_Genome not in seen_clust_Genomes:
+                         seen_clust_Genomes.append(clust_Genome)
+                         if clust_Genome not in pep_genomes:
+                             added_Second_genomes += 1
+                 else:
+                     sys.exit("Error: looking for sequence_tag")
+
+             size_of_pep_clusters = []
+             peps = num_clustered_First[cluster]
+             for pep in peps:
+                 pep = pep.rsplit('_', 1)
+                 size_of_pep_clusters.append(int(pep[1]))
+             pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
+                                                 size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
+
+         except KeyError:
+             ###Singleton
+             num_pep_genomes = [len(pep_genomes)]
+             pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
+                                                 len(seen_Seconds)]
+
+     return pangenome_clusters_Type
+
+ #@profile
+ def single_clustering_counting(options, pangenome_clusters_First, reps):
+     num_clustered_PEP = defaultdict(list)
+     recorded_PEP = []
+     pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
+     list_of_reps = list(reps.keys())
+     for cluster, pep_genomes in pangenome_clusters_First.items():
+         rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
+
+         try: # get the cluster from the storf clusters which contains this rep
+             num_clustered_PEP[cluster].append(rep + '_' + str(len(pep_genomes)))
+             size_of_pep_clusters = []
+             peps = num_clustered_PEP[cluster]
+             for pep in peps:
+                 pep = pep.rsplit('_', 1)
+                 size_of_pep_clusters.append(int(pep[1]))
+                 recorded_PEP.append(pep[0])
+             pangenome_clusters_Type[cluster] = [len(num_clustered_PEP[cluster]), sum(size_of_pep_clusters),
+                                                 size_of_pep_clusters, 0, 0, 0]
+
+         except KeyError:
+             ###Singleton
+             num_pep_genomes = [len(pep_genomes)]
+             pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, 0, 0, 0]
+
+     return pangenome_clusters_Type
+
+
+
+ #@profile
+ def combined_clustering_CDHIT(options, genome_dict):
+     unique_genomes = []
+     Second_in = open(options.reclustered, 'r')
+     combined_pangenome_clusters_First = OrderedDict()
+     combined_pangenome_clusters_First_sequences = OrderedDict()
+     combined_pangenome_clusters_Second = OrderedDict()
+     combined_pangenome_clusters_Second_sequences = OrderedDict()
+     combined_pangenome_clusters_First_Second_clustered = OrderedDict()
+
+     not_Second_only_cluster_ids = []
+     already_seen_PEP = []
+     Combined_clusters = OrderedDict()
+     Combined_reps = OrderedDict()
+     first = True
+     for line in Second_in:
+         if line.startswith('>'):
+             if first == False:
+                 cluster_size = len(Combined_clusters[cluster_id])
+                 Combined_reps.update({rep: cluster_size})
+                 for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
+                     if pep != []:
+                         if pep in already_seen_PEP:
+                             continue
+                         else:
+                             already_seen_PEP.append(pep)
+                 if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
+                     if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
+                         all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
+                         storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
+                         VALUE = all_but_first + storfs_clustered
+                     else:
+                         VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
+                     KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
+                     combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
+             cluster_id = line.strip('>')
+             cluster_id = cluster_id.strip('\n')
+             cluster_id = cluster_id.split(' ')[1]
+             Combined_clusters.update({cluster_id: []})
+             combined_pangenome_clusters_First.update({cluster_id: []})
+             combined_pangenome_clusters_First_sequences.update({cluster_id: []})
+             combined_pangenome_clusters_Second.update({cluster_id: []})
+             combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
+
+             first = False
+         else:
+             clustered = line.split('\t')[1]
+             clustered = clustered.split('>')[1]
+             clustered = clustered.split('...')[0]
+             genome = clustered.split('|')[0]
+             genome_dict[genome] += 1
+             if '*' in line:
+                 rep = clustered
+                 Combined_reps.update({rep: 0})
+             if first == False:
+                 Combined_clusters[cluster_id].append(clustered)
+                 clustered_genome = clustered.split('|')[0]
+                 if options.sequence_tag in line:
+                     if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
+                         combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
+                     combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
+                 else:
+                     if cluster_id not in not_Second_only_cluster_ids:
+                         not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
+                     if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
+                         combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
+                     combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
+
+
+     return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
+
+ def combined_clustering_Edge_List(options, genome_dict):
+     if options.format == 'TSV':
+         separator = '\t'
+     elif options.format == 'CSV':
+         separator = ','
+     unique_genomes = []
+     cluster_id = 0
+     last_rep = ''
+     Second_in = open(options.reclustered, 'r')
+     combined_pangenome_clusters_First = OrderedDict()
+     combined_pangenome_clusters_First_sequences = OrderedDict()
+     combined_pangenome_clusters_Second = OrderedDict()
+     combined_pangenome_clusters_Second_sequences = OrderedDict()
+     combined_pangenome_clusters_First_Second_clustered = OrderedDict()
+
+     not_Second_only_cluster_ids = []
+     already_seen_PEP = []
+     Combined_clusters = OrderedDict()
+     Combined_reps = OrderedDict()
+     first = True
+     for line in Second_in:
+         rep, child = line.strip().split(separator)
+         child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
+
+         if first == True:
+             Combined_clusters.update({cluster_id: []})
+             combined_pangenome_clusters_First.update({cluster_id: []})
+             combined_pangenome_clusters_First_sequences.update({cluster_id: []})
+             combined_pangenome_clusters_Second.update({cluster_id: []})
+             combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
+             Combined_reps.update({rep: 0})
+             first = False
+
+         if first == False:
+             if rep != last_rep and last_rep != '':
+                 cluster_size = len(Combined_clusters[cluster_id])
+                 Combined_reps.update({rep: cluster_size})
+                 for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
+                     if pep != []:
+                         if pep in already_seen_PEP:
+                             continue
+                         else:
+                             already_seen_PEP.append(pep)
+                 if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
+                     if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
+                         all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
+                         storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
+                         VALUE = all_but_first + storfs_clustered
+                     else:
+                         VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
+                     KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
+                     combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
+
+                 cluster_id += 1
+                 Combined_clusters.update({cluster_id: []})
+                 combined_pangenome_clusters_First.update({cluster_id: []})
+                 combined_pangenome_clusters_First_sequences.update({cluster_id: []})
+                 combined_pangenome_clusters_Second.update({cluster_id: []})
+                 combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
+                 Combined_reps.update({rep: 0})
+
+
+             Combined_clusters[cluster_id].append(child)
+             if options.sequence_tag in line:
+                 if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
+                     combined_pangenome_clusters_Second[cluster_id].append(child_genome)
+                 combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
+             else:
+                 if cluster_id not in not_Second_only_cluster_ids:
+                     not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
+                 if child_genome not in combined_pangenome_clusters_First[cluster_id]:
+                     combined_pangenome_clusters_First[cluster_id].append(child_genome)
+                 combined_pangenome_clusters_First_sequences[cluster_id].append(child)
+
+         last_rep = rep
+
+     return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
+
+
+ def cluster_EdgeList(options):
+     if options.format == 'TSV':
+         separator = '\t'
+     elif options.format == 'CSV':
+         separator = ','
+     cluster_id = 0
+     last_rep = ''
+     first = True
+     First_in = open(options.clusters, 'r')
+     pangenome_clusters_First = OrderedDict()
+     pangenome_clusters_First_sequences = OrderedDict()
+     genome_dict = defaultdict(int)
+     reps = OrderedDict()
+     for line in First_in:
+         rep, child = line.strip().split(separator)
+         child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
+         # Counting occurrences of genomes
+         genome_dict[child_genome] += 1
+         if first == True:
+             pangenome_clusters_First[0] = []
+             pangenome_clusters_First_sequences[0] = []
+             first = False
+
+         if rep != last_rep and last_rep != '':
+             cluster_id +=1
+             pangenome_clusters_First[cluster_id] = []
+             pangenome_clusters_First_sequences[cluster_id] = []
+             cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
+             reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
+             pangenome_clusters_First[cluster_id] = []
+             pangenome_clusters_First_sequences[cluster_id] = []
+         if child_genome not in pangenome_clusters_First[cluster_id]:
+             pangenome_clusters_First[cluster_id].append(child_genome)
+
+         pangenome_clusters_First_sequences[cluster_id].append(child)
+         last_rep = rep
+     cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
+     reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
+
+
+     return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
+
+
+
+ def cluster_CDHIT(options):
+     First_in = open(options.clusters, 'r')
+     clusters = OrderedDict()
+     pangenome_clusters_First = OrderedDict()
+     pangenome_clusters_First_sequences = OrderedDict()
+     first = True
+     genome_dict = defaultdict(int)
+     reps = OrderedDict()
+     ## Load in all data for easier reuse later
+     for line in First_in:
+         if line.startswith('>'):
+             if first == False:
+                 cluster_size = len(clusters[cluster_id])
+                 reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
+             cluster_id = line.strip('>')
+             cluster_id = cluster_id.strip('\n')
+             cluster_id = cluster_id.split(' ')[1]
+             clusters.update({cluster_id: []})
+             pangenome_clusters_First.update({cluster_id: []})
+             pangenome_clusters_First_sequences.update({cluster_id: []})
+
+             first = False
+         else:
+             clustered = line.split('\t')[1]
+             clustered = clustered.split('>')[1]
+             clustered = clustered.split('...')[0]
+             genome = clustered.split('|')[0]
+             genome_dict[genome] += 1
+             if '*' in line:
+                 rep = clustered
+                 reps.update({rep: [0, 0]})
+             if first == False:
+                 clusters[cluster_id].append(clustered)
+                 clustered_genome = clustered.split('|')[0]
+                 if clustered_genome not in pangenome_clusters_First[cluster_id]:
+                     pangenome_clusters_First[cluster_id].append(clustered_genome)
+                 pangenome_clusters_First_sequences[cluster_id].append(clustered)
+     return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
+
+ #@profile
+ def cluster(options):
+
+     if options.format == 'CD-HIT':
+         genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
+     elif options.format in ['TSV','CSV']:
+         genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
+
+     ######################################
+     cores, groups = get_cores(options, genome_dict)
+     ###
+
+     if options.reclustered != None:
+         if options.format == 'CD-HIT':
+             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
+                 unique_genomes = combined_clustering_CDHIT(options, genome_dict)
+         if options.format in ['TSV','CSV']: # also handle CSV edge lists, matching the first-round handling above
+             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
+                 unique_genomes = combined_clustering_Edge_List(options, genome_dict)
+         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
+     else:
+         pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
+
+
+     counter = 0
+     Number_Of_StORF_Extending_But_Same_Genomes = 0
+
+     sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
+     pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
+     pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
+     pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
+
+     print("Calculating Groups")
+     for cluster, numbers in pangenome_clusters_Type_sorted.items():
+         ############################### Calculate First only
+         if numbers[0] == 1 and numbers[1] >=2:
+             calc_First_only_core(cluster, numbers[1],groups,cores)
+             counter +=1
+         elif numbers[0] >1 and numbers[1] >=2:
+             calc_First_only_core(cluster, numbers[2][0],groups,cores)
+             counter += 1
+
+         if options.reclustered != None:
+             ############################# Calculate First and Reclustered-Second
+             if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
+                 calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
+             elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
+                 calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
+             elif numbers[4] >= 1:
+                 Number_Of_StORF_Extending_But_Same_Genomes += 1
+     if options.reclustered != None: # Second-only groups are only available after a second round of clustering
+         combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
+         combined_pangenome_clusters_Second_Type = defaultdict(list)
+         for cluster, genomes in combined_pangenome_clusters_Second.items():
+             if cluster in not_Second_only_cluster_ids:
+                 combined_pangenome_clusters_Second_Type[cluster] = [cluster, len(genomes)]
+             else:
+                 combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
+         for cluster, data in combined_pangenome_clusters_Second_Type.items():
+             calc_Second_only_core(groups, cores, data[1])
+         for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
+             if data[1] >= 2:
+                 calc_only_Second_only_core(groups, cores, data[1])
+     ###########################
+     print("End")
+     key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
+     print("Gene Family Groups:")
+     for key_prefix in key_order:
+         for key, value in cores.items():
+             if key.startswith(key_prefix):
+                 print(f"{key}: {len(value)}")
+
+     if options.gene_presence_absence_out != None:
+         gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
+
+     if options.write_families != None and options.fasta != None:
+         sequences = read_fasta(options.fasta)
+         input_dir = os.path.dirname(os.path.abspath(options.clusters))
+         output_dir = os.path.join(input_dir, 'Gene_Families_Output')
+
+         # Create output directory if it doesn't exist
+         if not os.path.exists(output_dir):
+             os.makedirs(output_dir)
+         for key_prefix in key_order:
+             for key, values in cores.items():
+                 if any(part in options.write_families.split(',') for part in key.split('_')):
+                     if key.startswith(key_prefix):
+                         for value in values:
+                             output_filename = f"{key}_{value}.fasta"
+                             sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
+                             # Write sequences to output file that are in the sequences dictionary
+                             with open(os.path.join(output_dir, output_filename), 'w') as outfile:
+                                 for header in sequences_to_write:
+                                     if header in sequences:
+                                         outfile.write(f">{header}\n")
+                                         wrapped_sequence = wrap_sequence(sequences[header])
+                                         outfile.write(f"{wrapped_sequence}\n")
+
+
+
+ def main():
+
+     parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
+     parser._action_groups.pop()
+
+     required = parser.add_argument_group('Required Arguments')
+     required.add_argument('-c', action='store', dest='clusters', help='Clustering output file from CD-HIT, TSV or CSV Edge List',
+                           required=True)
+     required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
+                           help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)
+
+     output_args = parser.add_argument_group('Output Parameters')
+     output_args.add_argument('-w', action="store", dest='write_families', default=None,
+                              help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95"'
+                                   ' - Must provide FASTA file with -fasta')
+     output_args.add_argument('-fasta', action='store', dest='fasta',
+                              help='FASTA file to use in conjunction with "-w"',
+                              required=False)
+
+     optional = parser.add_argument_group('Optional Arguments')
+     optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
+                           required=False)
+     optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
+                           required=False)
+     optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,90,80,15",
+                           help='Default - (\'99,95,90,80,15\'): Gene family groups to use')
+     optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
+                           required=False)
+
+     misc = parser.add_argument_group('Misc')
+     misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
+                       help='Default - False: Print out runtime messages')
+     misc.add_argument('-v', action='store_true', dest='version',
+                       help='Default - False: Print out version number and exit')
+
+
+     options = parser.parse_args()
+     if options.clusters == None or options.format == None:
+         if options.version:
+             sys.exit(PyamilySeq_Version)
+         else:
+             exit('PyamilySeq: error: the following arguments are required: -c, -f')
+
+     if options.sequence_tag == None:
+         options.sequence_tag = 'StORF'
+
+
+     if options.write_families != None and options.fasta == None:
+         exit("-fasta must be provided if -w is used")
+
+
+     options.clusters = os.path.normpath(options.clusters)
+     options.clusters = os.path.realpath(options.clusters)
+     if options.reclustered:
+         options.reclustered = os.path.normpath(options.reclustered)
+         options.reclustered = os.path.realpath(options.reclustered)
+
+
+     options.core_groups = options.core_groups + ',0'
+
+     cluster(options)
+
+
+
+
+
+     print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
+           "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
+
+
+
+
+
+ if __name__ == "__main__":
+     main()
+     print("Complete")
+
PyamilySeq/__init__.py ADDED
File without changes