PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
@@ -1,127 +1,42 @@
  #from line_profiler_pycharm import profile

- from collections import OrderedDict,defaultdict
- import copy
- import math
- import sys
- from tempfile import NamedTemporaryFile
-
-

  try:
  from .Constants import *
+ from .clusterings import *
  from .utils import *
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
  from Constants import *
+ from clusterings import *
  from utils import *


- def custom_sort_key(k, dict1, dict2):
- return (len(dict1[k]), len(dict2[k]))
-
- def sort_keys_by_values(dict1, dict2):
- sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
- return sorted_keys
-
- def select_longest_gene(sequences):
- """Select the longest sequence for each genome."""
- longest_sequences = {}
- for seq_id, sequence in sequences.items():
- genome = seq_id.split('|')[0] # Assuming genome name can be derived from the sequence ID
- if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
- longest_sequences[genome] = (seq_id, sequence)
- return longest_sequences
-
-
- def run_mafft_on_sequences(options, sequences, output_file):
- print("Conducting MAFFT alignment.")
- """Run mafft on the given sequences and write to output file."""
- # Create a temporary input file for mafft
- with NamedTemporaryFile('w', delete=False) as temp_input_file:
- for header, sequence in sequences.items():
- temp_input_file.write(f">{header}\n{sequence}\n")
- temp_input_file_path = temp_input_file.name
-
- # Run mafft
- try:
- with open(output_file, 'w') as output_f:
- if options.verbose == True:
- subprocess.run(
- ['mafft', '--auto', temp_input_file_path],
- stdout=output_f,
- stderr=sys.stderr,
- check=True
- )
- else:
- subprocess.run(
- ['mafft', '--auto', temp_input_file_path],
- stdout=output_f,
- stderr=subprocess.DEVNULL, # Suppress stderr
- check=True
- )
- finally:
- os.remove(temp_input_file_path) # Clean up the temporary file
-
-
- def process_gene_families(options, directory, output_file):
- """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
- concatenated_sequences = {}
- output_file = directory.replace('Gene_Families_Output',output_file)
-
- # Iterate over each gene family file
- for gene_file in os.listdir(directory):
- if gene_file.endswith('.fasta'):
- gene_path = os.path.join(directory, gene_file)
-
- # Read sequences from the gene family file
- sequences = read_fasta(gene_path)
-
- # Select the longest sequence for each genome
- longest_sequences = select_longest_gene(sequences)
-
- # Run mafft on the longest sequences
- aligned_file = f"{gene_file}_aligned.fasta"
- run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
-
- # Read aligned sequences and concatenate them
- aligned_sequences = read_fasta(aligned_file)
- for genome, aligned_seq in aligned_sequences.items():
- genome_name = genome.split('|')[0]
- if genome_name not in concatenated_sequences:
- concatenated_sequences[genome_name] = ""
- concatenated_sequences[genome_name] += aligned_seq
-
- # Clean up aligned file
- os.remove(aligned_file)
-
- # Write the concatenated sequences to the output file
- with open(output_file, 'w') as out:
- for genome, sequence in concatenated_sequences.items():
- out.write(f">{genome}\n")
- wrapped_sequence = wrap_sequence(sequence, 60)
- out.write(f"{wrapped_sequence}\n")
-
- def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
+ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
  print("Outputting gene_presence_absence file")
- in_name = options.clusters.split('.')[0]
- gpa_outfile = open(in_name+'_gene_presence_absence.csv','w')
+ output_dir = os.path.abspath(options.output_dir)
+ in_name = options.clusters.split('.')[0].split('/')[-1]
+ gpa_outfile = os.path.join(output_dir, in_name)
+ gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
  gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
  '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
- gpa_outfile.write('","'.join(genome_dict.keys()))
+ gpa_outfile.write('","'.join(genus_dict.keys()))
  gpa_outfile.write('"\n')
  for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
  average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
  gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
  '","","","","","","","","",""')

- full_out = ''
- for genome in genome_dict.keys():
+
+ for genus in genus_dict.keys():
+ full_out = ''
  tmp_list = []
  for value in sequences:
- if value.split('|')[0] == genome:
+ if value.split('_')[0] == genus:
  tmp_list.append(value)
  if tmp_list:
  full_out += ',"'+''.join(tmp_list)+'"'
+ else:
+ full_out = ',""'
  gpa_outfile.write(full_out)
  gpa_outfile.write('\n')

@@ -138,448 +53,85 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
  # edge_list_outfile.write(line + '\n')


- def wrap_sequence(sequence, width=60):
- wrapped_sequence = []
- for i in range(0, len(sequence), width):
- wrapped_sequence.append(sequence[i:i + width])
- return "\n".join(wrapped_sequence)
-
-
- def read_fasta(fasta_file):
- sequences = {}
- current_sequence = None
- with open(fasta_file, 'r') as file:
- for line in file:
- line = line.strip()
- if not line:
- continue # Skip empty lines
- if line.startswith('>'):
- current_sequence = line[1:] # Remove '>' character
- sequences[current_sequence] = ''
- else:
- sequences[current_sequence] += line
- return sequences
-

- def reorder_dict_by_keys(original_dict, sorted_keys):
- return {k: original_dict[k] for k in sorted_keys}

- def get_cores(options,genome_dict):
+ def get_cores(options):
  ##Calculate core groups
  groups = OrderedDict()
  cores = OrderedDict()
- prev_top = len(genome_dict)
- first = True
  for group in options.core_groups.split(','):
- calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
- if first == False:
- # Ensure no overlap
- # if calculated_floor <= prev_top:
- # calculated_floor = prev_top - 1
-
- groups[group] = (calculated_floor,prev_top)
- else:
- groups[group] = (calculated_floor, prev_top)
- first = False
- prev_top = calculated_floor
- first_core_group = 'first_core_' + group
+ first_core_group = 'First_genera_' + group
  cores[first_core_group] = []
  if options.reclustered != None:
- extended_core_group = 'extended_core_' + group
+ extended_core_group = 'extended_genera_' + group
  cores[extended_core_group] = []
- combined_core_group = 'combined_core_' + group
+ combined_core_group = 'combined_genera_' + group
  cores[combined_core_group] = []
- second_core_group = 'second_core_' + group
+ second_core_group = 'Second_genera_' + group
  cores[second_core_group] = []
- only_second_core_group = 'only_second_core_' + group
+ only_second_core_group = 'only_Second_genera_' + group
  cores[only_second_core_group] = []
  return cores, groups

  #@profile
- def calc_First_only_core(cluster, pep_num, groups, cores):
- groups_as_list = list(groups.values())
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
- res = idx
- family_group = list(groups)[res]
- cores['first_core_'+family_group].append(cluster)
-
- #@profile
- def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
- groups_as_list = list(groups.values())
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
- res = idx
- family_group = list(groups)[res]
- cores['extended_core_' + family_group].append(pep_num)
-
-
- #@profile
- def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
- groups_as_list = list(groups.values())
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
- res = idx
- family_group = list(groups)[res]
- cores['combined_core_' + family_group] += 1
-
-
+ def calc_First_only_core(cluster, First_num, cores):
+ try:
+ cores['First_genera_' + str(First_num)].append(cluster)
+ except KeyError:
+ cores['First_genera_>'].append(cluster)
  #@profile
- def calc_Second_only_core(groups, cores, second_num):
- groups_as_list = list(groups.values())
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
- res = idx
- family_group = list(groups)[res]
- cores['second_core_' + family_group] += 1
-
+ def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
+ group = First_num + Second_num
+ try:
+ cores['extended_genera_' + str(group)].append(cluster)
+ except KeyError:
+ cores['extended_genera_>'].append(cluster)
  #@profile
- def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
- groups_as_list = list(groups.values())
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
- res = idx
- family_group = list(groups)[res]
- cores['only_second_core_' + family_group] += 1
-
-
-
-
-
+ def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
+ group = First_num + Second_num
+ try:
+ cores['combined_genera_' + str(group)].append(cluster)
+ except KeyError:
+ cores['combined_genera_>'].append(cluster)
  #@profile
- def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
- num_clustered_First = defaultdict(list)
- pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
- list_of_reps = list(reps.keys())
- for cluster, pep_genomes in pangenome_clusters_First.items():
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
- Com_PEP_Genomes = 0
- Seconds = 0
- seen_Seconds = []
- added_Second_genomes = 0
- try: # get the cluster from the storf clusters which contains this rep
- clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
- seen_clust_Genomes = []
- num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
- for clust in clustered_combined:
- if options.sequence_tag not in clust: # Not good enough at the moment
- clust_Genome = clust.split('|')[0]
- if clust_Genome not in seen_clust_Genomes:
- seen_clust_Genomes.append(clust_Genome)
- if clust_Genome not in pep_genomes:
- Com_PEP_Genomes += 1
- num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
- elif options.sequence_tag in clust:
- Seconds += 1
- clust_Genome = clust.split('|')[0]
- if clust_Genome not in seen_Seconds:
- seen_Seconds.append(clust_Genome)
- if clust_Genome not in seen_clust_Genomes:
- seen_clust_Genomes.append(clust_Genome)
- if clust_Genome not in pep_genomes:
- added_Second_genomes += 1
- else:
- sys.exit("Error: looking for sequence_tag")
-
- size_of_pep_clusters = []
- peps = num_clustered_First[cluster]
- for pep in peps:
- pep = pep.rsplit('_', 1)
- size_of_pep_clusters.append(int(pep[1]))
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
- size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
-
- except KeyError:
- ###Singleton
- num_pep_genomes = [len(pep_genomes)]
- pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
- len(seen_Seconds)]
-
- return pangenome_clusters_Type
-
+ def calc_Second_only_core(cluster, cores, Second_num):
+ try:
+ cores['Second_genera_' + str(Second_num)].append(cluster)
+ except KeyError:
+ cores['Second_genera_>'].append(cluster)
  #@profile
- def single_clustering_counting(options, pangenome_clusters_First, reps):
- num_clustered_First = defaultdict(list)
- recorded_First = []
- pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
- list_of_reps = list(reps.keys())
- for cluster, First_genomes in pangenome_clusters_First.items():
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
-
- try: # get the cluster from the storf clusters which contains this rep
- num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
- size_of_First_clusters = []
- Firsts = num_clustered_First[cluster]
- for First in Firsts:
- First = First.rsplit('_', 1)
- size_of_First_clusters.append(int(First[1]))
- recorded_First.append(First[0])
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
- size_of_First_clusters, 0, 0, 0]
-
- except KeyError:
- ###Singleton
- num_pep_genomes = [len(First_genomes)]
- pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
-
- return pangenome_clusters_Type
-
+ def calc_only_Second_only_core(cluster, cores, Second_num): # only count the true storf onlies
+ try:
+ cores['only_Second_genera_' + str(Second_num)].append(cluster)
+ except:
+ cores['only_Second_genera_>'].append(cluster)


- #@profile
- def combined_clustering_CDHIT(options, genome_dict):
- unique_genomes = []
- Second_in = open(options.reclustered, 'r')
- combined_pangenome_clusters_First = OrderedDict()
- combined_pangenome_clusters_First_sequences = OrderedDict()
- combined_pangenome_clusters_Second = OrderedDict()
- combined_pangenome_clusters_Second_sequences = OrderedDict()
- combined_pangenome_clusters_First_Second_clustered = OrderedDict()
-
- not_Second_only_cluster_ids = []
- already_seen_PEP = []
- Combined_clusters = OrderedDict()
- Combined_reps = OrderedDict()
- first = True
- for line in Second_in:
- if line.startswith('>'):
- if first == False:
- cluster_size = len(Combined_clusters[cluster_id])
- Combined_reps.update({rep: cluster_size})
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
- if pep != []:
- if pep in already_seen_PEP:
- continue
- else:
- already_seen_PEP.append(pep)
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
- VALUE = all_but_first + storfs_clustered
- else:
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
- combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
- cluster_id = line.strip('>')
- cluster_id = cluster_id.strip('\n')
- cluster_id = cluster_id.split(' ')[1]
- Combined_clusters.update({cluster_id: []})
- combined_pangenome_clusters_First.update({cluster_id: []})
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
- combined_pangenome_clusters_Second.update({cluster_id: []})
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
-
- first = False
- else:
- clustered = line.split('\t')[1]
- clustered = clustered.split('>')[1]
- clustered = clustered.split('...')[0]
- genome = clustered.split('|')[0]
- genome_dict[genome] += 1
- if '*' in line:
- rep = clustered
- Combined_reps.update({rep: 0})
- if first == False:
- Combined_clusters[cluster_id].append(clustered)
- clustered_genome = clustered.split('|')[0]
- if options.sequence_tag in line:
- if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
- combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
- combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
- else:
- if cluster_id not in not_Second_only_cluster_ids:
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
- if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
- combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
- combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
-
-
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
-
- def combined_clustering_Edge_List(options, genome_dict):
- if options.cluster_format == 'TSV':
- separator = '\t'
- elif options.cluster_format == 'CSV':
- separator = ','
- unique_genomes = []
- cluster_id = 0
- last_rep = ''
- Second_in = open(options.reclustered, 'r')
- combined_pangenome_clusters_First = OrderedDict()
- combined_pangenome_clusters_First_sequences = OrderedDict()
- combined_pangenome_clusters_Second = OrderedDict()
- combined_pangenome_clusters_Second_sequences = OrderedDict()
- combined_pangenome_clusters_First_Second_clustered = OrderedDict()
-
- not_Second_only_cluster_ids = []
- already_seen_PEP = []
- Combined_clusters = OrderedDict()
- Combined_reps = OrderedDict()
- first = True
- for line in Second_in:
- rep, child = line.strip().split(separator)
- child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
-
- if first == True:
- Combined_clusters.update({cluster_id: []})
- combined_pangenome_clusters_First.update({cluster_id: []})
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
- combined_pangenome_clusters_Second.update({cluster_id: []})
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
- Combined_reps.update({rep: 0})
- first = False
-
- if first == False:
- if rep != last_rep and last_rep != '':
- cluster_size = len(Combined_clusters[cluster_id])
- Combined_reps.update({rep: cluster_size})
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
- if pep != []:
- if pep in already_seen_PEP:
- continue
- else:
- already_seen_PEP.append(pep)
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
- VALUE = all_but_first + storfs_clustered
- else:
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
- combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
-
- cluster_id += 1
- Combined_clusters.update({cluster_id: []})
- combined_pangenome_clusters_First.update({cluster_id: []})
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
- combined_pangenome_clusters_Second.update({cluster_id: []})
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
- Combined_reps.update({rep: 0})
-
-
- Combined_clusters[cluster_id].append(child)
- if options.sequence_tag in line:
- if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
- combined_pangenome_clusters_Second[cluster_id].append(child_genome)
- combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
- else:
- if cluster_id not in not_Second_only_cluster_ids:
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
- if child_genome not in combined_pangenome_clusters_First[cluster_id]:
- combined_pangenome_clusters_First[cluster_id].append(child_genome)
- combined_pangenome_clusters_First_sequences[cluster_id].append(child)
-
- last_rep = rep
-
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
-
-
- def cluster_EdgeList(options):
- if options.cluster_format == 'TSV':
- separator = '\t'
- elif options.cluster_format == 'CSV':
- separator = ','
- cluster_id = 0
- last_rep = ''
- first = True
- First_in = open(options.clusters, 'r')
- pangenome_clusters_First = OrderedDict()
- pangenome_clusters_First_sequences = OrderedDict()
- genome_dict = defaultdict(int)
- reps = OrderedDict()
- for line in First_in:
- rep, child = line.strip().split(separator)
- child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
- # Counting occurrences of genomes
- genome_dict[child_genome] += 1
- if first == True:
- pangenome_clusters_First[0] = []
- pangenome_clusters_First_sequences[0] = []
- first = False
-
- if rep != last_rep and last_rep != '':
- cluster_id +=1
- pangenome_clusters_First[cluster_id] = []
- pangenome_clusters_First_sequences[cluster_id] = []
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
- reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
- pangenome_clusters_First[cluster_id] = []
- pangenome_clusters_First_sequences[cluster_id] = []
- if child_genome not in pangenome_clusters_First[cluster_id]:
- pangenome_clusters_First[cluster_id].append(child_genome)
-
- pangenome_clusters_First_sequences[cluster_id].append(child)
- last_rep = rep
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
-
-
- return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
-
-
-
- def cluster_CDHIT(options):
- First_in = open(options.clusters, 'r')
- clusters = OrderedDict()
- pangenome_clusters_First = OrderedDict()
- pangenome_clusters_First_sequences = OrderedDict()
- first = True
- genome_dict = defaultdict(int)
- reps = OrderedDict()
- ## Load in all data for easier reuse later
- for line in First_in:
- if line.startswith('>'):
- if first == False:
- cluster_size = len(clusters[cluster_id])
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
- cluster_id = line.strip('>')
- cluster_id = cluster_id.strip('\n')
- cluster_id = cluster_id.split(' ')[1]
- clusters.update({cluster_id: []})
- pangenome_clusters_First.update({cluster_id: []})
- pangenome_clusters_First_sequences.update({cluster_id: []})
-
- first = False
- else:
- clustered = line.split('\t')[1]
- clustered = clustered.split('>')[1]
- clustered = clustered.split('...')[0]
- genome = clustered.split('|')[0]
- genome_dict[genome] += 1
- if '*' in line:
- rep = clustered
- reps.update({rep: [0, 0]})
- if first == False:
- clusters[cluster_id].append(clustered)
- clustered_genome = clustered.split('|')[0]
- if clustered_genome not in pangenome_clusters_First[cluster_id]:
- pangenome_clusters_First[cluster_id].append(clustered_genome)
- pangenome_clusters_First_sequences[cluster_id].append(clustered)
- return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps

  #@profile
  def cluster(options):

  if options.cluster_format == 'CD-HIT':
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
- elif options.cluster_format in ['TSV','CSV']:
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
+ genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
+ genus_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')

- ######################################
- cores, groups = get_cores(options, genome_dict)
+ ###
+ cores, groups = get_cores(options)
  ###

  if options.reclustered != None:
  if options.cluster_format == 'CD-HIT':
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
- unique_genomes = combined_clustering_CDHIT(options, genome_dict)
- if options.cluster_format == 'TSV':
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
- unique_genomes = combined_clustering_Edge_List(options, genome_dict)
- pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genus_dict, '_')
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '_')
+ pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '_')
  else:
- pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
+ pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)



- Number_Of_StORF_Extending_But_Same_Genomes = 0
+ Number_Of_Second_Extending_But_Same_Genomes = 0

  sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
  pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
@@ -587,23 +139,28 @@ def cluster(options):
  pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

  print("Calculating Groups")
+ seen_groupings = []
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
  ############################### Calculate First only
- #if numbers[0] == 1 and numbers[1] >=2:
- calc_First_only_core(cluster, numbers[1],groups,cores)
-
- # elif numbers[0] >1 and numbers[1] >=2:
- # calc_First_only_core(cluster, numbers[2][0],groups,cores)
-
+ cluster = str(cluster)
+ for grouping in numbers[2]: #!!# Could do with a more elegant solution
+ current_cluster = grouping[0].split(':')[0]
+ if current_cluster not in seen_groupings:
+ seen_groupings.append(current_cluster)
+ current_cluster_size = grouping[0].split(':')[1]
+ calc_First_only_core(current_cluster, current_cluster_size, cores)
+ ############################# Calculate First and Reclustered-Second
+ if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
+ calc_single_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
+ elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
+ calc_multi_First_extended_Second_only_core(cluster, numbers[1], cores, numbers[3])
+ elif numbers[4] >= 1:
+ Number_Of_Second_Extending_But_Same_Genomes += 1
+ else:
+ if options.verbose == True:
+ print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")

  if options.reclustered != None:
- ############################# Calculate First and Reclustered-Second
- if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
- calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
- elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
- calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
- elif numbers[4] >= 1:
- Number_Of_StORF_Extending_But_Same_Genomes += 1
  combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
  combined_pangenome_clusters_Second_Type = defaultdict(list)
  for cluster, genomes in combined_pangenome_clusters_Second.items():
@@ -612,47 +169,73 @@ def cluster(options):
  else:
  combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
  for cluster, data in combined_pangenome_clusters_Second_Type.items():
- calc_Second_only_core(groups, cores, data[1])
+ if data[1] >= 1:
+ calc_Second_only_core(cluster, cores, data[1])
  for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
- if data[1] >= 2:
- calc_only_Second_only_core(groups, cores, data[1])
+ if data[1] >= 1:
+ calc_only_Second_only_core(cluster, cores, data[1])
  ###########################
- key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
- print("Gene Groups:")
- for key_prefix in key_order:
- for key, value in cores.items():
- if key.startswith(key_prefix):
- print(f"{key}: {len(value)}")
- print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+ ### Output
+ output_path = os.path.abspath(options.output_dir)
+ if not os.path.exists(output_path):
+ os.makedirs(output_path)
+ stats_out = os.path.join(output_path,'summary_statistics.txt')
+ key_order = list(cores.keys())
+ with open(stats_out,'w') as outfile:
+ print("Genus Groups:")
+ outfile.write("Genus Groups:\n")
+ for key in key_order:
+ print(key+':\t'+str(len(cores[key])))
+ outfile.write(key + ':\t' + str(len(cores[key]))+'\n')
+ print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+ outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+ if options.reclustered!= None:
+ print("Total Number of Second Gene Groups (Including Singletons): " + str(
+ len(combined_pangenome_clusters_Second_sequences)))
+ print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
+ Number_Of_Second_Extending_But_Same_Genomes))
+ outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
+ len(combined_pangenome_clusters_Second_sequences)))
+ outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
+ Number_Of_Second_Extending_But_Same_Genomes))

  if options.gene_presence_absence_out != None:
- gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
-
- if options.write_families != None and options.fasta != None:
- sequences = read_fasta(options.fasta)
- input_dir = os.path.dirname(os.path.abspath(options.clusters))
- output_dir = os.path.join(input_dir, 'Gene_Families_Output')
-
- # Create output directory if it doesn't exist
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- for key_prefix in key_order:
- for key, values in cores.items():
- if any(part in options.write_families.split(',') for part in key.split('_')):
- if key.startswith(key_prefix):
- for value in values:
- output_filename = f"{key}_{value}.fasta"
- sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
- # Write sequences to output file that are in the sequences dictionary
- with open(os.path.join(output_dir, output_filename), 'w') as outfile:
- for header in sequences_to_write:
- if header in sequences:
- outfile.write(f">{header}\n")
- wrapped_sequence = wrap_sequence(sequences[header])
- outfile.write(f"{wrapped_sequence}\n")
-
- if options.con_core != None and options.fasta != None and options.write_families != None:
- process_gene_families(options, os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
+ gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
+
+ if options.run_mode == 'Full':
+ if options.reclustered == None:
+ combined_pangenome_clusters_Second_sequences = None
+ if options.write_groups != None:
+ print("Outputting gene group FASTA files")
+ sequences = read_fasta(options.fasta)
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+ write_groups(options,output_dir, key_order, cores, sequences,
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+ elif options.run_mode == 'Partial':
+ if options.reclustered == None:
+ combined_pangenome_clusters_Second_sequences = None
+ if options.write_groups != None and options.fasta != None:
+ print("Outputting gene group FASTA files")
+ sequences = read_fasta(options.fasta)
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+ write_groups(options,output_dir, key_order, cores, sequences,
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+
+ # if options.write_groups != None and options.fasta != None:
+ # sequences = read_fasta(options.fasta)
+ # output_dir = os.path.join(output_path, 'Gene_Families_Output')
+ #
+ # write_groups(options,output_dir, key_order, cores, sequences,
+ # pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+
+ #!!# - Currently only align in Species Mode
+ #if options.align_core != None and options.fasta != None and options.write_groups != None:
+ # process_gene_families(options, os.path.join(output_path, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')

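For context on the change from percentage-based core thresholds to exact-count buckets, below is a minimal, self-contained sketch of the bucketing pattern visible in the new get_cores and calc_First_only_core above: clusters are binned by their exact genus count, with a '>' key acting as the overflow bucket implied by the KeyError fallback. The SimpleNamespace stand-in for the parsed options, the sample core_groups value, and the driver loop are illustrative assumptions (simplified to the First buckets only), not part of the released package.

from collections import OrderedDict
from types import SimpleNamespace

# Stand-in for the parsed command-line options (illustrative assumption):
# 'core_groups' lists the exact genus counts to track, with '>' as the
# overflow key suggested by the KeyError fallback in the diff above.
options = SimpleNamespace(core_groups='1,2,3,>', reclustered=None)

def get_cores(options):
    # Build one empty bucket per requested group, keyed by name.
    cores = OrderedDict()
    for group in options.core_groups.split(','):
        cores['First_genera_' + group] = []
    return cores

def calc_First_only_core(cluster, First_num, cores):
    # Exact-count bucket if it exists, otherwise the '>' overflow bucket.
    try:
        cores['First_genera_' + str(First_num)].append(cluster)
    except KeyError:
        cores['First_genera_>'].append(cluster)

cores = get_cores(options)
for cluster_id, genus_count in [('0', 1), ('1', 3), ('2', 7)]:
    calc_First_only_core(cluster_id, genus_count, cores)

for name, members in cores.items():
    print(name + ':\t' + str(len(members)))
# Expected: First_genera_1 and First_genera_3 each hold one cluster,
# First_genera_2 is empty, and the count of 7 falls into First_genera_>.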