PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,109 +1,23 @@
1
1
  #from line_profiler_pycharm import profile
2
2
 
3
- from collections import OrderedDict,defaultdict
4
- import copy
5
3
  import math
6
- import sys
7
- from tempfile import NamedTemporaryFile
8
-
9
-
10
4
 
11
5
  try:
12
6
  from .Constants import *
7
+ from .clusterings import *
13
8
  from .utils import *
14
9
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
15
10
  from Constants import *
11
+ from clusterings import *
16
12
  from utils import *
17
13
 
18
14
 
19
- def custom_sort_key(k, dict1, dict2):
20
- return (len(dict1[k]), len(dict2[k]))
21
-
22
- def sort_keys_by_values(dict1, dict2):
23
- sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
24
- return sorted_keys
25
-
26
- def select_longest_gene(sequences):
27
- """Select the longest sequence for each genome."""
28
- longest_sequences = {}
29
- for seq_id, sequence in sequences.items():
30
- genome = seq_id.split('|')[0] # Assuming genome name can be derived from the sequence ID
31
- if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
32
- longest_sequences[genome] = (seq_id, sequence)
33
- return longest_sequences
34
-
35
-
36
- def run_mafft_on_sequences(options, sequences, output_file):
37
- """Run mafft on the given sequences and write to output file."""
38
- # Create a temporary input file for mafft
39
- with NamedTemporaryFile('w', delete=False) as temp_input_file:
40
- for header, sequence in sequences.items():
41
- temp_input_file.write(f">{header}\n{sequence}\n")
42
- temp_input_file_path = temp_input_file.name
43
-
44
- # Run mafft
45
- try:
46
- with open(output_file, 'w') as output_f:
47
- if options.verbose == 'True':
48
- subprocess.run(
49
- ['mafft', '--auto', temp_input_file_path],
50
- stdout=output_f,
51
- stderr=sys.stderr,
52
- check=True
53
- )
54
- else:
55
- subprocess.run(
56
- ['mafft', '--auto', temp_input_file_path],
57
- stdout=output_f,
58
- stderr=subprocess.DEVNULL, # Suppress stderr
59
- check=True
60
- )
61
- finally:
62
- os.remove(temp_input_file_path) # Clean up the temporary file
63
-
64
-
65
- def process_gene_families(options, directory, output_file):
66
- """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
67
- concatenated_sequences = {}
68
- output_file = directory.replace('Gene_Families_Output',output_file)
69
-
70
- # Iterate over each gene family file
71
- for gene_file in os.listdir(directory):
72
- if gene_file.endswith('.fasta'):
73
- gene_path = os.path.join(directory, gene_file)
74
-
75
- # Read sequences from the gene family file
76
- sequences = read_fasta(gene_path)
77
-
78
- # Select the longest sequence for each genome
79
- longest_sequences = select_longest_gene(sequences)
80
-
81
- # Run mafft on the longest sequences
82
- aligned_file = f"{gene_file}_aligned.fasta"
83
- run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
84
-
85
- # Read aligned sequences and concatenate them
86
- aligned_sequences = read_fasta(aligned_file)
87
- for genome, aligned_seq in aligned_sequences.items():
88
- genome_name = genome.split('|')[0]
89
- if genome_name not in concatenated_sequences:
90
- concatenated_sequences[genome_name] = ""
91
- concatenated_sequences[genome_name] += aligned_seq
92
-
93
- # Clean up aligned file
94
- os.remove(aligned_file)
95
-
96
- # Write the concatenated sequences to the output file
97
- with open(output_file, 'w') as out:
98
- for genome, sequence in concatenated_sequences.items():
99
- out.write(f">{genome}\n")
100
- wrapped_sequence = wrap_sequence(sequence, 60)
101
- out.write(f"{wrapped_sequence}\n")
102
-
103
15
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
104
16
  print("Outputting gene_presence_absence file")
105
- in_name = options.clusters.split('.')[0]
106
- gpa_outfile = open(in_name+'_gene_presence_absence.csv','w')
17
+ output_dir = os.path.abspath(options.output_dir)
18
+ in_name = options.clusters.split('.')[0].split('/')[-1]
19
+ gpa_outfile = os.path.join(output_dir, in_name)
20
+ gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
107
21
  gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
108
22
  '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
109
23
  gpa_outfile.write('","'.join(genome_dict.keys()))
@@ -113,14 +27,17 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
113
27
  gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
114
28
  '","","","","","","","","",""')
115
29
 
116
- full_out = ''
30
+
117
31
  for genome in genome_dict.keys():
32
+ full_out = ''
118
33
  tmp_list = []
119
34
  for value in sequences:
120
35
  if value.split('|')[0] == genome:
121
36
  tmp_list.append(value)
122
37
  if tmp_list:
123
38
  full_out += ',"'+''.join(tmp_list)+'"'
39
+ else:
40
+ full_out = ',""'
124
41
  gpa_outfile.write(full_out)
125
42
  gpa_outfile.write('\n')
126
43
 
@@ -137,31 +54,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
137
54
  # edge_list_outfile.write(line + '\n')
138
55
 
139
56
 
140
- def wrap_sequence(sequence, width=60):
141
- wrapped_sequence = []
142
- for i in range(0, len(sequence), width):
143
- wrapped_sequence.append(sequence[i:i + width])
144
- return "\n".join(wrapped_sequence)
145
-
146
-
147
- def read_fasta(fasta_file):
148
- sequences = {}
149
- current_sequence = None
150
- with open(fasta_file, 'r') as file:
151
- for line in file:
152
- line = line.strip()
153
- if not line:
154
- continue
155
- if line.startswith('>'):
156
- current_sequence = line[1:]
157
- sequences[current_sequence] = ''
158
- else:
159
- sequences[current_sequence] += line
160
- return sequences
161
-
162
57
 
163
- def reorder_dict_by_keys(original_dict, sorted_keys):
164
- return {k: original_dict[k] for k in sorted_keys}
165
58
 
166
59
  def get_cores(options,genome_dict):
167
60
  ##Calculate core groups
@@ -172,409 +65,93 @@ def get_cores(options,genome_dict):
172
65
  for group in options.core_groups.split(','):
173
66
  calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
174
67
  if first == False:
175
- # Ensure no overlap
176
- # if calculated_floor <= prev_top:
177
- # calculated_floor = prev_top - 1
178
-
179
68
  groups[group] = (calculated_floor,prev_top)
180
69
  else:
181
70
  groups[group] = (calculated_floor, prev_top)
182
71
  first = False
183
72
  prev_top = calculated_floor
184
- first_core_group = 'first_core_' + group
73
+ first_core_group = 'First_core_' + group
185
74
  cores[first_core_group] = []
186
75
  if options.reclustered != None:
187
76
  extended_core_group = 'extended_core_' + group
188
77
  cores[extended_core_group] = []
189
78
  combined_core_group = 'combined_core_' + group
190
79
  cores[combined_core_group] = []
191
- second_core_group = 'second_core_' + group
80
+ second_core_group = 'Second_core_' + group
192
81
  cores[second_core_group] = []
193
- only_second_core_group = 'only_second_core_' + group
82
+ only_second_core_group = 'only_Second_core_' + group
194
83
  cores[only_second_core_group] = []
195
84
  return cores, groups
196
85
 
197
86
  #@profile
198
- def calc_First_only_core(cluster, pep_num, groups, cores):
87
+ def calc_First_only_core(cluster, First_num, groups, cores):
199
88
  groups_as_list = list(groups.values())
200
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
89
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
201
90
  res = idx
202
91
  family_group = list(groups)[res]
203
- cores['first_core_'+family_group].append(cluster)
92
+ cores['First_core_'+family_group].append(cluster)
204
93
 
205
94
  #@profile
206
- def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
95
+ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
207
96
  groups_as_list = list(groups.values())
208
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
97
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
209
98
  res = idx
99
+
210
100
  family_group = list(groups)[res]
211
- cores['extended_core_' + family_group].append(pep_num)
101
+ cores['extended_core_' + family_group].append(cluster)
212
102
 
213
103
 
214
104
  #@profile
215
- def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
105
+ def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
216
106
  groups_as_list = list(groups.values())
217
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
218
- res = idx
107
+ # Looping through the list to find the matching condition
108
+ for idx, (sec, fir) in enumerate(groups_as_list):
109
+ if sec <= First_num + Second_num <= fir:
110
+ res = idx
111
+ break
219
112
  family_group = list(groups)[res]
220
- cores['combined_core_' + family_group] += 1
113
+ cores['combined_core_' + family_group].append(cluster)
221
114
 
222
115
 
223
116
  #@profile
224
- def calc_Second_only_core(groups, cores, second_num):
117
+ def calc_Second_only_core(cluster, Second_num, groups, cores):
225
118
  groups_as_list = list(groups.values())
226
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
119
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
227
120
  res = idx
228
121
  family_group = list(groups)[res]
229
- cores['second_core_' + family_group] += 1
122
+ cores['Second_core_' + family_group].append(cluster)
230
123
 
231
124
  #@profile
232
- def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
125
+ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
233
126
  groups_as_list = list(groups.values())
234
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
127
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
235
128
  res = idx
236
129
  family_group = list(groups)[res]
237
- cores['only_second_core_' + family_group] += 1
238
-
239
-
240
-
241
-
242
-
243
- #@profile
244
- def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
245
- num_clustered_First = defaultdict(list)
246
- pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
247
- list_of_reps = list(reps.keys())
248
- for cluster, pep_genomes in pangenome_clusters_First.items():
249
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
250
- Com_PEP_Genomes = 0
251
- Seconds = 0
252
- seen_Seconds = []
253
- added_Second_genomes = 0
254
- try: # get the cluster from the storf clusters which contains this rep
255
- clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
256
- seen_clust_Genomes = []
257
- num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
258
- for clust in clustered_combined:
259
- if options.sequence_tag not in clust: # Not good enough at the moment
260
- clust_Genome = clust.split('|')[0]
261
- if clust_Genome not in seen_clust_Genomes:
262
- seen_clust_Genomes.append(clust_Genome)
263
- if clust_Genome not in pep_genomes:
264
- Com_PEP_Genomes += 1
265
- num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
266
- elif options.sequence_tag in clust:
267
- Seconds += 1
268
- clust_Genome = clust.split('|')[0]
269
- if clust_Genome not in seen_Seconds:
270
- seen_Seconds.append(clust_Genome)
271
- if clust_Genome not in seen_clust_Genomes:
272
- seen_clust_Genomes.append(clust_Genome)
273
- if clust_Genome not in pep_genomes:
274
- added_Second_genomes += 1
275
- else:
276
- sys.exit("Error: looking for sequence_tag")
277
-
278
- size_of_pep_clusters = []
279
- peps = num_clustered_First[cluster]
280
- for pep in peps:
281
- pep = pep.rsplit('_', 1)
282
- size_of_pep_clusters.append(int(pep[1]))
283
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
284
- size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
285
-
286
- except KeyError:
287
- ###Singleton
288
- num_pep_genomes = [len(pep_genomes)]
289
- pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
290
- len(seen_Seconds)]
291
-
292
- return pangenome_clusters_Type
293
-
294
- #@profile
295
- def single_clustering_counting(options, pangenome_clusters_First, reps):
296
- num_clustered_First = defaultdict(list)
297
- recorded_First = []
298
- pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
299
- list_of_reps = list(reps.keys())
300
- for cluster, First_genomes in pangenome_clusters_First.items():
301
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
302
-
303
- try: # get the cluster from the storf clusters which contains this rep
304
- num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
305
- size_of_First_clusters = []
306
- Firsts = num_clustered_First[cluster]
307
- for First in Firsts:
308
- First = First.rsplit('_', 1)
309
- size_of_First_clusters.append(int(First[1]))
310
- recorded_First.append(First[0])
311
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
312
- size_of_First_clusters, 0, 0, 0]
313
-
314
- except KeyError:
315
- ###Singleton
316
- num_pep_genomes = [len(First_genomes)]
317
- pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
318
-
319
- return pangenome_clusters_Type
320
-
321
-
322
-
323
- #@profile
324
- def combined_clustering_CDHIT(options, genome_dict):
325
- unique_genomes = []
326
- Second_in = open(options.reclustered, 'r')
327
- combined_pangenome_clusters_First = OrderedDict()
328
- combined_pangenome_clusters_First_sequences = OrderedDict()
329
- combined_pangenome_clusters_Second = OrderedDict()
330
- combined_pangenome_clusters_Second_sequences = OrderedDict()
331
- combined_pangenome_clusters_First_Second_clustered = OrderedDict()
332
-
333
- not_Second_only_cluster_ids = []
334
- already_seen_PEP = []
335
- Combined_clusters = OrderedDict()
336
- Combined_reps = OrderedDict()
337
- first = True
338
- for line in Second_in:
339
- if line.startswith('>'):
340
- if first == False:
341
- cluster_size = len(Combined_clusters[cluster_id])
342
- Combined_reps.update({rep: cluster_size})
343
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
344
- if pep != []:
345
- if pep in already_seen_PEP:
346
- continue
347
- else:
348
- already_seen_PEP.append(pep)
349
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
350
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
351
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
352
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
353
- VALUE = all_but_first + storfs_clustered
354
- else:
355
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
356
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
357
- combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
358
- cluster_id = line.strip('>')
359
- cluster_id = cluster_id.strip('\n')
360
- cluster_id = cluster_id.split(' ')[1]
361
- Combined_clusters.update({cluster_id: []})
362
- combined_pangenome_clusters_First.update({cluster_id: []})
363
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
364
- combined_pangenome_clusters_Second.update({cluster_id: []})
365
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
366
-
367
- first = False
368
- else:
369
- clustered = line.split('\t')[1]
370
- clustered = clustered.split('>')[1]
371
- clustered = clustered.split('...')[0]
372
- genome = clustered.split('|')[0]
373
- genome_dict[genome] += 1
374
- if '*' in line:
375
- rep = clustered
376
- Combined_reps.update({rep: 0})
377
- if first == False:
378
- Combined_clusters[cluster_id].append(clustered)
379
- clustered_genome = clustered.split('|')[0]
380
- if options.sequence_tag in line:
381
- if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
382
- combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
383
- combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
384
- else:
385
- if cluster_id not in not_Second_only_cluster_ids:
386
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
387
- if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
388
- combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
389
- combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
390
-
391
-
392
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
393
-
394
- def combined_clustering_Edge_List(options, genome_dict):
395
- if options.cluster_format == 'TSV':
396
- separator = '\t'
397
- elif options.cluster_format == 'CSV':
398
- separator = ','
399
- unique_genomes = []
400
- cluster_id = 0
401
- last_rep = ''
402
- Second_in = open(options.reclustered, 'r')
403
- combined_pangenome_clusters_First = OrderedDict()
404
- combined_pangenome_clusters_First_sequences = OrderedDict()
405
- combined_pangenome_clusters_Second = OrderedDict()
406
- combined_pangenome_clusters_Second_sequences = OrderedDict()
407
- combined_pangenome_clusters_First_Second_clustered = OrderedDict()
408
-
409
- not_Second_only_cluster_ids = []
410
- already_seen_PEP = []
411
- Combined_clusters = OrderedDict()
412
- Combined_reps = OrderedDict()
413
- first = True
414
- for line in Second_in:
415
- rep, child = line.strip().split(separator)
416
- child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
417
-
418
- if first == True:
419
- Combined_clusters.update({cluster_id: []})
420
- combined_pangenome_clusters_First.update({cluster_id: []})
421
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
422
- combined_pangenome_clusters_Second.update({cluster_id: []})
423
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
424
- Combined_reps.update({rep: 0})
425
- first = False
426
-
427
- if first == False:
428
- if rep != last_rep and last_rep != '':
429
- cluster_size = len(Combined_clusters[cluster_id])
430
- Combined_reps.update({rep: cluster_size})
431
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
432
- if pep != []:
433
- if pep in already_seen_PEP:
434
- continue
435
- else:
436
- already_seen_PEP.append(pep)
437
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
438
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
439
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
440
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
441
- VALUE = all_but_first + storfs_clustered
442
- else:
443
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
444
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
445
- combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
446
-
447
- cluster_id += 1
448
- Combined_clusters.update({cluster_id: []})
449
- combined_pangenome_clusters_First.update({cluster_id: []})
450
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
451
- combined_pangenome_clusters_Second.update({cluster_id: []})
452
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
453
- Combined_reps.update({rep: 0})
454
-
455
-
456
- Combined_clusters[cluster_id].append(child)
457
- if options.sequence_tag in line:
458
- if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
459
- combined_pangenome_clusters_Second[cluster_id].append(child_genome)
460
- combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
461
- else:
462
- if cluster_id not in not_Second_only_cluster_ids:
463
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
464
- if child_genome not in combined_pangenome_clusters_First[cluster_id]:
465
- combined_pangenome_clusters_First[cluster_id].append(child_genome)
466
- combined_pangenome_clusters_First_sequences[cluster_id].append(child)
130
+ cores['only_Second_core_' + family_group].append(cluster)
467
131
 
468
- last_rep = rep
469
132
 
470
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
471
-
472
-
473
- def cluster_EdgeList(options):
474
- if options.cluster_format == 'TSV':
475
- separator = '\t'
476
- elif options.cluster_format == 'CSV':
477
- separator = ','
478
- cluster_id = 0
479
- last_rep = ''
480
- first = True
481
- First_in = open(options.clusters, 'r')
482
- pangenome_clusters_First = OrderedDict()
483
- pangenome_clusters_First_sequences = OrderedDict()
484
- genome_dict = defaultdict(int)
485
- reps = OrderedDict()
486
- for line in First_in:
487
- rep, child = line.strip().split(separator)
488
- child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
489
- # Counting occurrences of genomes
490
- genome_dict[child_genome] += 1
491
- if first == True:
492
- pangenome_clusters_First[0] = []
493
- pangenome_clusters_First_sequences[0] = []
494
- first = False
495
-
496
- if rep != last_rep and last_rep != '':
497
- cluster_id +=1
498
- pangenome_clusters_First[cluster_id] = []
499
- pangenome_clusters_First_sequences[cluster_id] = []
500
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
501
- reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
502
- pangenome_clusters_First[cluster_id] = []
503
- pangenome_clusters_First_sequences[cluster_id] = []
504
- if child_genome not in pangenome_clusters_First[cluster_id]:
505
- pangenome_clusters_First[cluster_id].append(child_genome)
506
-
507
- pangenome_clusters_First_sequences[cluster_id].append(child)
508
- last_rep = rep
509
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
510
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
511
-
512
-
513
- return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
514
-
515
-
516
-
517
- def cluster_CDHIT(options):
518
- First_in = open(options.clusters, 'r')
519
- clusters = OrderedDict()
520
- pangenome_clusters_First = OrderedDict()
521
- pangenome_clusters_First_sequences = OrderedDict()
522
- first = True
523
- genome_dict = defaultdict(int)
524
- reps = OrderedDict()
525
- ## Load in all data for easier reuse later
526
- for line in First_in:
527
- if line.startswith('>'):
528
- if first == False:
529
- cluster_size = len(clusters[cluster_id])
530
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
531
- cluster_id = line.strip('>')
532
- cluster_id = cluster_id.strip('\n')
533
- cluster_id = cluster_id.split(' ')[1]
534
- clusters.update({cluster_id: []})
535
- pangenome_clusters_First.update({cluster_id: []})
536
- pangenome_clusters_First_sequences.update({cluster_id: []})
537
-
538
- first = False
539
- else:
540
- clustered = line.split('\t')[1]
541
- clustered = clustered.split('>')[1]
542
- clustered = clustered.split('...')[0]
543
- genome = clustered.split('|')[0]
544
- genome_dict[genome] += 1
545
- if '*' in line:
546
- rep = clustered
547
- reps.update({rep: [0, 0]})
548
- if first == False:
549
- clusters[cluster_id].append(clustered)
550
- clustered_genome = clustered.split('|')[0]
551
- if clustered_genome not in pangenome_clusters_First[cluster_id]:
552
- pangenome_clusters_First[cluster_id].append(clustered_genome)
553
- pangenome_clusters_First_sequences[cluster_id].append(clustered)
554
- return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
555
133
 
556
134
  #@profile
557
135
  def cluster(options):
558
136
 
559
137
  if options.cluster_format == 'CD-HIT':
560
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
561
- elif options.cluster_format in ['TSV','CSV']:
562
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
138
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
139
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
140
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
563
141
 
564
- ######################################
142
+ ###
565
143
  cores, groups = get_cores(options, genome_dict)
566
144
  ###
567
145
 
568
146
  if options.reclustered != None:
569
147
  if options.cluster_format == 'CD-HIT':
570
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
571
- unique_genomes = combined_clustering_CDHIT(options, genome_dict)
572
- if options.cluster_format == 'TSV':
573
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
574
- unique_genomes = combined_clustering_Edge_List(options, genome_dict)
575
- pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
148
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
149
+ elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
150
+ #Fix
151
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')
152
+ pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '|')
576
153
  else:
577
- pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
154
+ pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
578
155
 
579
156
 
580
157
 
@@ -586,23 +163,30 @@ def cluster(options):
586
163
  pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)
587
164
 
588
165
  print("Calculating Groups")
166
+ seen_groupings = []
589
167
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
590
168
  ############################### Calculate First only
591
- #if numbers[0] == 1 and numbers[1] >=2:
592
- calc_First_only_core(cluster, numbers[1],groups,cores)
593
-
594
- # elif numbers[0] >1 and numbers[1] >=2:
595
- # calc_First_only_core(cluster, numbers[2][0],groups,cores)
596
-
169
+ cluster = str(cluster)
170
+ if '78' in cluster:
171
+ pass
172
+ for grouping in numbers[2]: #!!# Could do with a more elegant solution
173
+ current_cluster = grouping[0].split(':')[0]
174
+ if current_cluster not in seen_groupings:
175
+ seen_groupings.append(current_cluster)
176
+ current_cluster_size = grouping[0].split(':')[1]
177
+ calc_First_only_core(current_cluster, current_cluster_size,groups,cores)
178
+ ############################# Calculate First and Reclustered-Second
179
+ if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
180
+ calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
181
+ elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
182
+ calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
183
+ elif numbers[4] >= 1:
184
+ Number_Of_Second_Extending_But_Same_Genomes += 1
185
+ else:
186
+ if options.verbose == True:
187
+ print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
597
188
 
598
189
  if options.reclustered != None:
599
- ############################# Calculate First and Reclustered-Second
600
- if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
601
- calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
602
- elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
603
- calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
604
- elif numbers[4] >= 1:
605
- Number_Of_Second_Extending_But_Same_Genomes += 1
606
190
  combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
607
191
  combined_pangenome_clusters_Second_Type = defaultdict(list)
608
192
  for cluster, genomes in combined_pangenome_clusters_Second.items():
@@ -611,120 +195,92 @@ def cluster(options):
611
195
  else:
612
196
  combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
613
197
  for cluster, data in combined_pangenome_clusters_Second_Type.items():
614
- calc_Second_only_core(groups, cores, data[1])
198
+ if data[1] >= 1:
199
+ calc_Second_only_core(cluster, data[1], groups, cores)
615
200
  for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
616
- if data[1] >= 2:
617
- calc_only_Second_only_core(groups, cores, data[1])
201
+ if data[1] >= 1:
202
+ calc_only_Second_only_core(cluster, data[1], groups, cores)
618
203
  ###########################
619
- key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
620
- print("Gene Groups:")
621
- for key_prefix in key_order:
622
- for key, value in cores.items():
623
- if key.startswith(key_prefix):
624
- print(f"{key}: {len(value)}")
625
- print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
626
-
204
+ ### Output
205
+ output_path = os.path.abspath(options.output_dir)
206
+ if not os.path.exists(output_path):
207
+ os.makedirs(output_path)
208
+ stats_out = os.path.join(output_path,'summary_statistics.txt')
209
+ key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
210
+ with open(stats_out, 'w') as outfile:
211
+ print("Gene Groups:")
212
+ outfile.write("Gene Groups:\n")
213
+ for key_prefix in key_order:
214
+ for key, value in cores.items():
215
+ if key.startswith(key_prefix):
216
+ print(f"{key}: {len(value)}")
217
+ outfile.write(f"{key}: {len(value)}\n")
218
+ print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
219
+ outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
220
+ if options.reclustered!= None:
221
+ print("Total Number of Second Gene Groups (Including Singletons): " + str(
222
+ len(combined_pangenome_clusters_Second_sequences)))
223
+ print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
224
+ Number_Of_Second_Extending_But_Same_Genomes))
225
+ outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
226
+ len(combined_pangenome_clusters_Second_sequences)))
227
+ outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
228
+ Number_Of_Second_Extending_But_Same_Genomes))
229
+ #Report number of first and second clusters and do the same for genus
627
230
  if options.gene_presence_absence_out != None:
628
231
  gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
629
232
 
630
- if options.write_families != None and options.fasta != None:
631
- sequences = read_fasta(options.fasta)
632
- input_dir = os.path.dirname(os.path.abspath(options.clusters))
633
- output_dir = os.path.join(input_dir, 'Gene_Families_Output')
634
233
 
635
- # Create output directory if it doesn't exist
636
- if not os.path.exists(output_dir):
637
- os.makedirs(output_dir)
638
- for key_prefix in key_order:
639
- for key, values in cores.items():
640
- if any(part in options.write_families.split(',') for part in key.split('_')):
641
- if key.startswith(key_prefix):
642
- for value in values:
643
- output_filename = f"{key}_{value}.fasta"
644
- sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
645
- # Write sequences to output file that are in the sequences dictionary
646
- with open(os.path.join(output_dir, output_filename), 'w') as outfile:
647
- for header in sequences_to_write:
648
- if header in sequences:
649
- outfile.write(f">{header}\n")
650
- wrapped_sequence = wrap_sequence(sequences[header])
651
- outfile.write(f"{wrapped_sequence}\n")
652
-
653
- if options.con_core != None and options.fasta != None and options.write_families != None:
654
- process_gene_families(options, os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
655
-
656
-
657
- # groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
658
- # """Run mafft on all .fasta files in the given directory."""
659
- # for filename in os.listdir(groups_dir):
660
- # if filename.endswith('.fasta'):
661
- # input_path = os.path.join(groups_dir, filename)
662
- # output_filename = filename.replace('.fasta', '_mafft.aln')
663
- # output_path = os.path.join(groups_dir, output_filename)
664
- #
665
- # # Call mafft command
666
- # try:
667
- # with open(output_path, 'w') as output_file:
668
- # subprocess.run(
669
- # ['mafft', '--auto', input_path],
670
- # stdout=output_file,
671
- # stderr=subprocess.DEVNULL, # Suppress stderr
672
- # check=True
673
- # )
674
- # print(f"Processed {input_path} -> {output_path}")
675
- # except subprocess.CalledProcessError as e:
676
- # print(f"Failed to process {input_path}: {e}")
677
-
678
- ##This could be run once and not above AND here..
679
- # output_dir = os.path.dirname(os.path.abspath(options.clusters))
680
- # sequences = read_fasta(options.fasta)
681
- # concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
682
- #
683
- #
684
- # for key_prefix in key_order:
685
- # for key, values in cores.items():
686
- # if any(part in options.con_core.split(',') for part in key.split('_')):
687
- # if key.startswith(key_prefix):
688
- # for value in values:
689
- # length_capture = {genome: [] for genome in genome_dict.keys()}
690
- # sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
691
- # for header in sequences_to_write:
692
- # if header in sequences:
693
- # length_capture[header.split('|')[0]].append([header,len(sequences[header])])
694
- # if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
695
- # for genome, lengths in length_capture.items():
696
- # max_value = float('-inf')
697
- # max_item = None
698
- # for length in lengths:
699
- # current_value = length[1]
700
- # if current_value > max_value:
701
- # max_value = current_value
702
- # max_item = length[0]
703
- # concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
704
- #
705
- #
706
- # with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
707
- # for genome, sequence in concatenated_sequences.items():
708
- # outfile.write(f">{genome}\n")
709
- # wrapped_sequence = wrap_sequence(sequence)
710
- # outfile.write(f"{wrapped_sequence}\n")
234
+ ###Need to fix this below. If full/partial, the ifs need to be different. If full, we first need to output the gfs and then align. If -write-groups is not provided, then it needs
235
+ # to be done for alignment full anyway...
711
236
 
237
+ if options.run_mode == 'Full':
238
+ if options.reclustered == None:
239
+ combined_pangenome_clusters_Second_sequences = None
240
+ if options.write_groups != None:
241
+ print("Outputting gene group FASTA files")
242
+ sequences = read_fasta(options.fasta)
243
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
244
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
245
+ write_groups(options,output_dir, key_order, cores, sequences,
246
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
712
247
 
713
- # for core_gene_family in core_gene_families:
714
- # found_sequences = {genome: False for genome in genomes}
715
- #
716
- # for fasta_file in fasta_files:f
717
- # sequences = read_fasta(fasta_file)
718
- # for header, sequence in sequences.items():
719
- # genome = header.split('|')[0]
720
- # if genome in genomes and core_gene_family in header:
721
- # concatenated_sequences[genome] += sequence
722
- # found_sequences[genome] = True
723
- #
724
- # for genome in genomes:
725
- # if not found_sequences[genome]:
726
- # concatenated_sequences[genome] += '-' * len(next(iter(sequences.values())))
248
+ if options.align_core != None:
249
+ print("Processing gene group alignment")
250
+ process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
251
+
252
+ elif options.run_mode == 'Partial':
253
+ if options.reclustered == None:
254
+ combined_pangenome_clusters_Second_sequences = None
255
+ if options.write_groups != None and options.fasta != None:
256
+ print("Outputting gene group FASTA files")
257
+ sequences = read_fasta(options.fasta)
258
+ #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
259
+ output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
260
+ write_groups(options,output_dir, key_order, cores, sequences,
261
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
727
262
 
263
+ if options.align_core != None:
264
+ print("Processing gene group alignment")
265
+ process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
728
266
 
729
267
 
730
268
 
269
+ #
270
+ # if options.align_core != None:
271
+ # #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
272
+ # output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
273
+ # if not os.path.exists(output_dir):
274
+ # os.makedirs(output_dir)
275
+ # process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
276
+
277
+ #
278
+ # elif options.run_mode == 'Partial':
279
+ # if options.align_core != None and options.fasta != None and options.write_groups != None:
280
+ # process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
281
+ #
282
+ #
283
+ #
284
+ #
285
+ #
286
+