PyamilySeq 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,67 +1,21 @@
1
1
  #from line_profiler_pycharm import profile
2
2
 
3
- from collections import OrderedDict,defaultdict
4
3
  import copy
5
4
  import math
6
5
  import sys
7
- from tempfile import NamedTemporaryFile
8
6
 
9
7
 
10
8
 
11
9
  try:
12
10
  from .Constants import *
11
+ from .clusterings import *
13
12
  from .utils import *
14
13
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
15
14
  from Constants import *
15
+ from clusterings import *
16
16
  from utils import *
17
17
 
18
18
 
19
- def custom_sort_key(k, dict1, dict2):
20
- return (len(dict1[k]), len(dict2[k]))
21
-
22
- def sort_keys_by_values(dict1, dict2):
23
- sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
24
- return sorted_keys
25
-
26
- def select_longest_gene(sequences):
27
- """Select the longest sequence for each genome."""
28
- longest_sequences = {}
29
- for seq_id, sequence in sequences.items():
30
- genome = seq_id.split('|')[0] # Assuming genome name can be derived from the sequence ID
31
- if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
32
- longest_sequences[genome] = (seq_id, sequence)
33
- return longest_sequences
34
-
35
-
36
- def run_mafft_on_sequences(options, sequences, output_file):
37
- """Run mafft on the given sequences and write to output file."""
38
- # Create a temporary input file for mafft
39
- with NamedTemporaryFile('w', delete=False) as temp_input_file:
40
- for header, sequence in sequences.items():
41
- temp_input_file.write(f">{header}\n{sequence}\n")
42
- temp_input_file_path = temp_input_file.name
43
-
44
- # Run mafft
45
- try:
46
- with open(output_file, 'w') as output_f:
47
- if options.verbose == 'True':
48
- subprocess.run(
49
- ['mafft', '--auto', temp_input_file_path],
50
- stdout=output_f,
51
- stderr=sys.stderr,
52
- check=True
53
- )
54
- else:
55
- subprocess.run(
56
- ['mafft', '--auto', temp_input_file_path],
57
- stdout=output_f,
58
- stderr=subprocess.DEVNULL, # Suppress stderr
59
- check=True
60
- )
61
- finally:
62
- os.remove(temp_input_file_path) # Clean up the temporary file
63
-
64
-
65
19
  def process_gene_families(options, directory, output_file):
66
20
  """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
67
21
  concatenated_sequences = {}
@@ -102,8 +56,10 @@ def process_gene_families(options, directory, output_file):
102
56
 
103
57
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
104
58
  print("Outputting gene_presence_absence file")
105
- in_name = options.clusters.split('.')[0]
106
- gpa_outfile = open(in_name+'_gene_presence_absence.csv','w')
59
+ output_dir = os.path.abspath(options.output_dir)
60
+ in_name = options.clusters.split('.')[0].split('/')[-1]
61
+ gpa_outfile = os.path.join(output_dir, in_name)
62
+ gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
107
63
  gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
108
64
  '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
109
65
  gpa_outfile.write('","'.join(genome_dict.keys()))
@@ -113,14 +69,17 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
113
69
  gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
114
70
  '","","","","","","","","",""')
115
71
 
116
- full_out = ''
72
+
117
73
  for genome in genome_dict.keys():
74
+ full_out = ''
118
75
  tmp_list = []
119
76
  for value in sequences:
120
77
  if value.split('|')[0] == genome:
121
78
  tmp_list.append(value)
122
79
  if tmp_list:
123
80
  full_out += ',"'+''.join(tmp_list)+'"'
81
+ else:
82
+ full_out = ',""'
124
83
  gpa_outfile.write(full_out)
125
84
  gpa_outfile.write('\n')
126
85
 
@@ -137,31 +96,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
137
96
  # edge_list_outfile.write(line + '\n')
138
97
 
139
98
 
140
- def wrap_sequence(sequence, width=60):
141
- wrapped_sequence = []
142
- for i in range(0, len(sequence), width):
143
- wrapped_sequence.append(sequence[i:i + width])
144
- return "\n".join(wrapped_sequence)
145
-
146
-
147
- def read_fasta(fasta_file):
148
- sequences = {}
149
- current_sequence = None
150
- with open(fasta_file, 'r') as file:
151
- for line in file:
152
- line = line.strip()
153
- if not line:
154
- continue
155
- if line.startswith('>'):
156
- current_sequence = line[1:]
157
- sequences[current_sequence] = ''
158
- else:
159
- sequences[current_sequence] += line
160
- return sequences
161
-
162
99
 
163
- def reorder_dict_by_keys(original_dict, sorted_keys):
164
- return {k: original_dict[k] for k in sorted_keys}
165
100
 
166
101
  def get_cores(options,genome_dict):
167
102
  ##Calculate core groups
@@ -172,409 +107,89 @@ def get_cores(options,genome_dict):
172
107
  for group in options.core_groups.split(','):
173
108
  calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
174
109
  if first == False:
175
- # Ensure no overlap
176
- # if calculated_floor <= prev_top:
177
- # calculated_floor = prev_top - 1
178
-
179
110
  groups[group] = (calculated_floor,prev_top)
180
111
  else:
181
112
  groups[group] = (calculated_floor, prev_top)
182
113
  first = False
183
114
  prev_top = calculated_floor
184
- first_core_group = 'first_core_' + group
115
+ first_core_group = 'First_core_' + group
185
116
  cores[first_core_group] = []
186
117
  if options.reclustered != None:
187
118
  extended_core_group = 'extended_core_' + group
188
119
  cores[extended_core_group] = []
189
120
  combined_core_group = 'combined_core_' + group
190
121
  cores[combined_core_group] = []
191
- second_core_group = 'second_core_' + group
122
+ second_core_group = 'Second_core_' + group
192
123
  cores[second_core_group] = []
193
- only_second_core_group = 'only_second_core_' + group
124
+ only_second_core_group = 'only_Second_core_' + group
194
125
  cores[only_second_core_group] = []
195
126
  return cores, groups
196
127
 
197
128
  #@profile
198
- def calc_First_only_core(cluster, pep_num, groups, cores):
129
+ def calc_First_only_core(cluster, First_num, groups, cores):
199
130
  groups_as_list = list(groups.values())
200
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
131
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num <= fir):
201
132
  res = idx
202
133
  family_group = list(groups)[res]
203
- cores['first_core_'+family_group].append(cluster)
134
+ cores['First_core_'+family_group].append(cluster)
204
135
 
205
136
  #@profile
206
- def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
137
+ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
207
138
  groups_as_list = list(groups.values())
208
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
139
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
209
140
  res = idx
210
141
  family_group = list(groups)[res]
211
- cores['extended_core_' + family_group].append(pep_num)
142
+ cores['extended_core_' + family_group].append(cluster)
212
143
 
213
144
 
214
145
  #@profile
215
- def calc_multi_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
146
+ def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
216
147
  groups_as_list = list(groups.values())
217
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
148
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
218
149
  res = idx
219
150
  family_group = list(groups)[res]
220
- cores['combined_core_' + family_group] += 1
151
+ cores['combined_core_' + family_group].append(cluster)
221
152
 
222
153
 
223
154
  #@profile
224
- def calc_Second_only_core(groups, cores, second_num):
155
+ def calc_Second_only_core(cluster, Second_num, groups, cores):
225
156
  groups_as_list = list(groups.values())
226
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
157
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
227
158
  res = idx
228
159
  family_group = list(groups)[res]
229
- cores['second_core_' + family_group] += 1
160
+ cores['Second_core_' + family_group].append(cluster)
230
161
 
231
162
  #@profile
232
- def calc_only_Second_only_core(groups, cores, second_num): # only count the true storf onlies
163
+ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
233
164
  groups_as_list = list(groups.values())
234
- for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
165
+ for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
235
166
  res = idx
236
167
  family_group = list(groups)[res]
237
- cores['only_second_core_' + family_group] += 1
238
-
239
-
240
-
241
-
242
-
243
- #@profile
244
- def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered):
245
- num_clustered_First = defaultdict(list)
246
- pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
247
- list_of_reps = list(reps.keys())
248
- for cluster, pep_genomes in pangenome_clusters_First.items():
249
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
250
- Com_PEP_Genomes = 0
251
- Seconds = 0
252
- seen_Seconds = []
253
- added_Second_genomes = 0
254
- try: # get the cluster from the storf clusters which contains this rep
255
- clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
256
- seen_clust_Genomes = []
257
- num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
258
- for clust in clustered_combined:
259
- if options.sequence_tag not in clust: # Not good enough at the moment
260
- clust_Genome = clust.split('|')[0]
261
- if clust_Genome not in seen_clust_Genomes:
262
- seen_clust_Genomes.append(clust_Genome)
263
- if clust_Genome not in pep_genomes:
264
- Com_PEP_Genomes += 1
265
- num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
266
- elif options.sequence_tag in clust:
267
- Seconds += 1
268
- clust_Genome = clust.split('|')[0]
269
- if clust_Genome not in seen_Seconds:
270
- seen_Seconds.append(clust_Genome)
271
- if clust_Genome not in seen_clust_Genomes:
272
- seen_clust_Genomes.append(clust_Genome)
273
- if clust_Genome not in pep_genomes:
274
- added_Second_genomes += 1
275
- else:
276
- sys.exit("Error: looking for sequence_tag")
277
-
278
- size_of_pep_clusters = []
279
- peps = num_clustered_First[cluster]
280
- for pep in peps:
281
- pep = pep.rsplit('_', 1)
282
- size_of_pep_clusters.append(int(pep[1]))
283
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
284
- size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
285
-
286
- except KeyError:
287
- ###Singleton
288
- num_pep_genomes = [len(pep_genomes)]
289
- pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
290
- len(seen_Seconds)]
291
-
292
- return pangenome_clusters_Type
293
-
294
- #@profile
295
- def single_clustering_counting(options, pangenome_clusters_First, reps):
296
- num_clustered_First = defaultdict(list)
297
- recorded_First = []
298
- pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
299
- list_of_reps = list(reps.keys())
300
- for cluster, First_genomes in pangenome_clusters_First.items():
301
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
302
-
303
- try: # get the cluster from the storf clusters which contains this rep
304
- num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
305
- size_of_First_clusters = []
306
- Firsts = num_clustered_First[cluster]
307
- for First in Firsts:
308
- First = First.rsplit('_', 1)
309
- size_of_First_clusters.append(int(First[1]))
310
- recorded_First.append(First[0])
311
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
312
- size_of_First_clusters, 0, 0, 0]
313
-
314
- except KeyError:
315
- ###Singleton
316
- num_pep_genomes = [len(First_genomes)]
317
- pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
318
-
319
- return pangenome_clusters_Type
320
-
321
-
322
-
323
- #@profile
324
- def combined_clustering_CDHIT(options, genome_dict):
325
- unique_genomes = []
326
- Second_in = open(options.reclustered, 'r')
327
- combined_pangenome_clusters_First = OrderedDict()
328
- combined_pangenome_clusters_First_sequences = OrderedDict()
329
- combined_pangenome_clusters_Second = OrderedDict()
330
- combined_pangenome_clusters_Second_sequences = OrderedDict()
331
- combined_pangenome_clusters_First_Second_clustered = OrderedDict()
332
-
333
- not_Second_only_cluster_ids = []
334
- already_seen_PEP = []
335
- Combined_clusters = OrderedDict()
336
- Combined_reps = OrderedDict()
337
- first = True
338
- for line in Second_in:
339
- if line.startswith('>'):
340
- if first == False:
341
- cluster_size = len(Combined_clusters[cluster_id])
342
- Combined_reps.update({rep: cluster_size})
343
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
344
- if pep != []:
345
- if pep in already_seen_PEP:
346
- continue
347
- else:
348
- already_seen_PEP.append(pep)
349
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
350
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
351
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
352
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
353
- VALUE = all_but_first + storfs_clustered
354
- else:
355
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
356
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
357
- combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
358
- cluster_id = line.strip('>')
359
- cluster_id = cluster_id.strip('\n')
360
- cluster_id = cluster_id.split(' ')[1]
361
- Combined_clusters.update({cluster_id: []})
362
- combined_pangenome_clusters_First.update({cluster_id: []})
363
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
364
- combined_pangenome_clusters_Second.update({cluster_id: []})
365
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
366
-
367
- first = False
368
- else:
369
- clustered = line.split('\t')[1]
370
- clustered = clustered.split('>')[1]
371
- clustered = clustered.split('...')[0]
372
- genome = clustered.split('|')[0]
373
- genome_dict[genome] += 1
374
- if '*' in line:
375
- rep = clustered
376
- Combined_reps.update({rep: 0})
377
- if first == False:
378
- Combined_clusters[cluster_id].append(clustered)
379
- clustered_genome = clustered.split('|')[0]
380
- if options.sequence_tag in line:
381
- if clustered_genome not in combined_pangenome_clusters_Second[cluster_id]:
382
- combined_pangenome_clusters_Second[cluster_id].append(clustered_genome)
383
- combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
384
- else:
385
- if cluster_id not in not_Second_only_cluster_ids:
386
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
387
- if clustered_genome not in combined_pangenome_clusters_First[cluster_id]:
388
- combined_pangenome_clusters_First[cluster_id].append(clustered_genome)
389
- combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
390
-
391
-
392
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
393
-
394
- def combined_clustering_Edge_List(options, genome_dict):
395
- if options.cluster_format == 'TSV':
396
- separator = '\t'
397
- elif options.cluster_format == 'CSV':
398
- separator = ','
399
- unique_genomes = []
400
- cluster_id = 0
401
- last_rep = ''
402
- Second_in = open(options.reclustered, 'r')
403
- combined_pangenome_clusters_First = OrderedDict()
404
- combined_pangenome_clusters_First_sequences = OrderedDict()
405
- combined_pangenome_clusters_Second = OrderedDict()
406
- combined_pangenome_clusters_Second_sequences = OrderedDict()
407
- combined_pangenome_clusters_First_Second_clustered = OrderedDict()
408
-
409
- not_Second_only_cluster_ids = []
410
- already_seen_PEP = []
411
- Combined_clusters = OrderedDict()
412
- Combined_reps = OrderedDict()
413
- first = True
414
- for line in Second_in:
415
- rep, child = line.strip().split(separator)
416
- child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
417
-
418
- if first == True:
419
- Combined_clusters.update({cluster_id: []})
420
- combined_pangenome_clusters_First.update({cluster_id: []})
421
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
422
- combined_pangenome_clusters_Second.update({cluster_id: []})
423
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
424
- Combined_reps.update({rep: 0})
425
- first = False
426
-
427
- if first == False:
428
- if rep != last_rep and last_rep != '':
429
- cluster_size = len(Combined_clusters[cluster_id])
430
- Combined_reps.update({rep: cluster_size})
431
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
432
- if pep != []:
433
- if pep in already_seen_PEP:
434
- continue
435
- else:
436
- already_seen_PEP.append(pep)
437
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
438
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
439
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
440
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
441
- VALUE = all_but_first + storfs_clustered
442
- else:
443
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
444
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
445
- combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
446
-
447
- cluster_id += 1
448
- Combined_clusters.update({cluster_id: []})
449
- combined_pangenome_clusters_First.update({cluster_id: []})
450
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
451
- combined_pangenome_clusters_Second.update({cluster_id: []})
452
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
453
- Combined_reps.update({rep: 0})
454
-
455
-
456
- Combined_clusters[cluster_id].append(child)
457
- if options.sequence_tag in line:
458
- if child_genome not in combined_pangenome_clusters_Second[cluster_id]:
459
- combined_pangenome_clusters_Second[cluster_id].append(child_genome)
460
- combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
461
- else:
462
- if cluster_id not in not_Second_only_cluster_ids:
463
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
464
- if child_genome not in combined_pangenome_clusters_First[cluster_id]:
465
- combined_pangenome_clusters_First[cluster_id].append(child_genome)
466
- combined_pangenome_clusters_First_sequences[cluster_id].append(child)
467
-
468
- last_rep = rep
469
-
470
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, unique_genomes
471
-
472
-
473
- def cluster_EdgeList(options):
474
- if options.cluster_format == 'TSV':
475
- separator = '\t'
476
- elif options.cluster_format == 'CSV':
477
- separator = ','
478
- cluster_id = 0
479
- last_rep = ''
480
- first = True
481
- First_in = open(options.clusters, 'r')
482
- pangenome_clusters_First = OrderedDict()
483
- pangenome_clusters_First_sequences = OrderedDict()
484
- genome_dict = defaultdict(int)
485
- reps = OrderedDict()
486
- for line in First_in:
487
- rep, child = line.strip().split(separator)
488
- child_genome = child.split('|')[0] # Extracting the genome identifier from the child sequence
489
- # Counting occurrences of genomes
490
- genome_dict[child_genome] += 1
491
- if first == True:
492
- pangenome_clusters_First[0] = []
493
- pangenome_clusters_First_sequences[0] = []
494
- first = False
495
-
496
- if rep != last_rep and last_rep != '':
497
- cluster_id +=1
498
- pangenome_clusters_First[cluster_id] = []
499
- pangenome_clusters_First_sequences[cluster_id] = []
500
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
501
- reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
502
- pangenome_clusters_First[cluster_id] = []
503
- pangenome_clusters_First_sequences[cluster_id] = []
504
- if child_genome not in pangenome_clusters_First[cluster_id]:
505
- pangenome_clusters_First[cluster_id].append(child_genome)
506
-
507
- pangenome_clusters_First_sequences[cluster_id].append(child)
508
- last_rep = rep
509
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
510
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
511
-
512
-
513
- return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
168
+ cores['only_Second_core_' + family_group].append(cluster)
514
169
 
515
170
 
516
171
 
517
- def cluster_CDHIT(options):
518
- First_in = open(options.clusters, 'r')
519
- clusters = OrderedDict()
520
- pangenome_clusters_First = OrderedDict()
521
- pangenome_clusters_First_sequences = OrderedDict()
522
- first = True
523
- genome_dict = defaultdict(int)
524
- reps = OrderedDict()
525
- ## Load in all data for easier reuse later
526
- for line in First_in:
527
- if line.startswith('>'):
528
- if first == False:
529
- cluster_size = len(clusters[cluster_id])
530
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
531
- cluster_id = line.strip('>')
532
- cluster_id = cluster_id.strip('\n')
533
- cluster_id = cluster_id.split(' ')[1]
534
- clusters.update({cluster_id: []})
535
- pangenome_clusters_First.update({cluster_id: []})
536
- pangenome_clusters_First_sequences.update({cluster_id: []})
537
-
538
- first = False
539
- else:
540
- clustered = line.split('\t')[1]
541
- clustered = clustered.split('>')[1]
542
- clustered = clustered.split('...')[0]
543
- genome = clustered.split('|')[0]
544
- genome_dict[genome] += 1
545
- if '*' in line:
546
- rep = clustered
547
- reps.update({rep: [0, 0]})
548
- if first == False:
549
- clusters[cluster_id].append(clustered)
550
- clustered_genome = clustered.split('|')[0]
551
- if clustered_genome not in pangenome_clusters_First[cluster_id]:
552
- pangenome_clusters_First[cluster_id].append(clustered_genome)
553
- pangenome_clusters_First_sequences[cluster_id].append(clustered)
554
- return genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
555
-
556
172
  #@profile
557
173
  def cluster(options):
558
174
 
559
175
  if options.cluster_format == 'CD-HIT':
560
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options)
176
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
561
177
  elif options.cluster_format in ['TSV','CSV']:
562
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options)
178
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
563
179
 
564
- ######################################
180
+ ###
565
181
  cores, groups = get_cores(options, genome_dict)
566
182
  ###
567
183
 
568
184
  if options.reclustered != None:
569
185
  if options.cluster_format == 'CD-HIT':
570
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
571
- unique_genomes = combined_clustering_CDHIT(options, genome_dict)
572
- if options.cluster_format == 'TSV':
573
- combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,\
574
- unique_genomes = combined_clustering_Edge_List(options, genome_dict)
575
- pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered)
186
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genome_dict, '|')
187
+ if options.cluster_format == ['TSV','CSV']:
188
+ combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '|')
189
+
190
+ pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '|')
576
191
  else:
577
- pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
192
+ pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
578
193
 
579
194
 
580
195
 
@@ -588,21 +203,17 @@ def cluster(options):
588
203
  print("Calculating Groups")
589
204
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
590
205
  ############################### Calculate First only
591
- #if numbers[0] == 1 and numbers[1] >=2:
592
206
  calc_First_only_core(cluster, numbers[1],groups,cores)
593
207
 
594
- # elif numbers[0] >1 and numbers[1] >=2:
595
- # calc_First_only_core(cluster, numbers[2][0],groups,cores)
596
-
597
-
598
208
  if options.reclustered != None:
599
209
  ############################# Calculate First and Reclustered-Second
600
210
  if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
601
- calc_single_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
211
+ calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
602
212
  elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
603
- calc_multi_First_extended_Second_only_core(numbers[1], groups, cores, numbers[3])
213
+ calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
604
214
  elif numbers[4] >= 1:
605
215
  Number_Of_Second_Extending_But_Same_Genomes += 1
216
+
606
217
  combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
607
218
  combined_pangenome_clusters_Second_Type = defaultdict(list)
608
219
  for cluster, genomes in combined_pangenome_clusters_Second.items():
@@ -611,26 +222,34 @@ def cluster(options):
611
222
  else:
612
223
  combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
613
224
  for cluster, data in combined_pangenome_clusters_Second_Type.items():
614
- calc_Second_only_core(groups, cores, data[1])
225
+ if data[1] >= 1:
226
+ calc_Second_only_core(cluster, data[1], groups, cores)
615
227
  for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
616
- if data[1] >= 2:
617
- calc_only_Second_only_core(groups, cores, data[1])
228
+ if data[1] >= 1:
229
+ calc_only_Second_only_core(cluster, data[1], groups, cores)
618
230
  ###########################
619
- key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
620
- print("Gene Groups:")
621
- for key_prefix in key_order:
622
- for key, value in cores.items():
623
- if key.startswith(key_prefix):
624
- print(f"{key}: {len(value)}")
625
- print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
231
+ ### Output
232
+ output_path = os.path.abspath(options.output_dir)
233
+ stats_out = os.path.join(output_path,'summary_statistics.txt')
234
+ key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
235
+ with open(stats_out, 'w') as outfile:
236
+ print("Gene Groups:")
237
+ outfile.write("Gene Groups:\n")
238
+ for key_prefix in key_order:
239
+ for key, value in cores.items():
240
+ if key.startswith(key_prefix):
241
+ print(f"{key}: {len(value)}")
242
+ outfile.write(f"{key}: {len(value)}\n")
243
+ print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
244
+ outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
626
245
 
627
246
  if options.gene_presence_absence_out != None:
628
247
  gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
629
248
 
630
249
  if options.write_families != None and options.fasta != None:
631
250
  sequences = read_fasta(options.fasta)
632
- input_dir = os.path.dirname(os.path.abspath(options.clusters))
633
- output_dir = os.path.join(input_dir, 'Gene_Families_Output')
251
+ output_dir = os.path.dirname(os.path.abspath(options.output_dir))
252
+ output_dir = os.path.join(output_dir, 'Gene_Families_Output')
634
253
 
635
254
  # Create output directory if it doesn't exist
636
255
  if not os.path.exists(output_dir):
@@ -651,79 +270,9 @@ def cluster(options):
651
270
  outfile.write(f"{wrapped_sequence}\n")
652
271
 
653
272
  if options.con_core != None and options.fasta != None and options.write_families != None:
654
- process_gene_families(options, os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
655
-
656
-
657
- # groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
658
- # """Run mafft on all .fasta files in the given directory."""
659
- # for filename in os.listdir(groups_dir):
660
- # if filename.endswith('.fasta'):
661
- # input_path = os.path.join(groups_dir, filename)
662
- # output_filename = filename.replace('.fasta', '_mafft.aln')
663
- # output_path = os.path.join(groups_dir, output_filename)
664
- #
665
- # # Call mafft command
666
- # try:
667
- # with open(output_path, 'w') as output_file:
668
- # subprocess.run(
669
- # ['mafft', '--auto', input_path],
670
- # stdout=output_file,
671
- # stderr=subprocess.DEVNULL, # Suppress stderr
672
- # check=True
673
- # )
674
- # print(f"Processed {input_path} -> {output_path}")
675
- # except subprocess.CalledProcessError as e:
676
- # print(f"Failed to process {input_path}: {e}")
677
-
678
- ##This could be run once and not above AND here..
679
- # output_dir = os.path.dirname(os.path.abspath(options.clusters))
680
- # sequences = read_fasta(options.fasta)
681
- # concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
682
- #
683
- #
684
- # for key_prefix in key_order:
685
- # for key, values in cores.items():
686
- # if any(part in options.con_core.split(',') for part in key.split('_')):
687
- # if key.startswith(key_prefix):
688
- # for value in values:
689
- # length_capture = {genome: [] for genome in genome_dict.keys()}
690
- # sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
691
- # for header in sequences_to_write:
692
- # if header in sequences:
693
- # length_capture[header.split('|')[0]].append([header,len(sequences[header])])
694
- # if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
695
- # for genome, lengths in length_capture.items():
696
- # max_value = float('-inf')
697
- # max_item = None
698
- # for length in lengths:
699
- # current_value = length[1]
700
- # if current_value > max_value:
701
- # max_value = current_value
702
- # max_item = length[0]
703
- # concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
704
- #
705
- #
706
- # with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
707
- # for genome, sequence in concatenated_sequences.items():
708
- # outfile.write(f">{genome}\n")
709
- # wrapped_sequence = wrap_sequence(sequence)
710
- # outfile.write(f"{wrapped_sequence}\n")
711
-
712
-
713
- # for core_gene_family in core_gene_families:
714
- # found_sequences = {genome: False for genome in genomes}
715
- #
716
- # for fasta_file in fasta_files:f
717
- # sequences = read_fasta(fasta_file)
718
- # for header, sequence in sequences.items():
719
- # genome = header.split('|')[0]
720
- # if genome in genomes and core_gene_family in header:
721
- # concatenated_sequences[genome] += sequence
722
- # found_sequences[genome] = True
723
- #
724
- # for genome in genomes:
725
- # if not found_sequences[genome]:
726
- # concatenated_sequences[genome] += '-' * len(next(iter(sequences.values())))
273
+ process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
274
+
275
+
727
276
 
728
277
 
729
278