PyamilySeq 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/clusterings.py CHANGED
@@ -1,25 +1,26 @@
1
- import subprocess
2
- import shutil
3
- import os
4
- import glob
1
+
5
2
  import sys
6
3
  import copy
7
4
  from collections import OrderedDict
8
5
  from collections import defaultdict
6
+ from collections import Counter
9
7
 
10
8
  def cluster_CDHIT(options, splitter):
11
9
  First_in = open(options.clusters, 'r')
12
10
  clusters = OrderedDict()
13
11
  pangenome_clusters_First = OrderedDict()
12
+ pangenome_clusters_First_genomes = OrderedDict()
14
13
  pangenome_clusters_First_sequences = OrderedDict()
15
14
  first = True
16
15
  taxa_dict = defaultdict(int)
17
16
  reps = OrderedDict()
17
+ tmp_genomes = None
18
18
  ## Load in all data for easier reuse later
19
19
  for line in First_in:
20
- if '>Cluster 7575' in line:
21
- print()
22
20
  if line.startswith('>'):
21
+ if tmp_genomes != None:
22
+ pangenome_clusters_First_genomes[rep] = tmp_genomes
23
+ tmp_genomes = []
23
24
  if first == False:
24
25
  cluster_size = len(clusters[cluster_id])
25
26
  reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
@@ -29,7 +30,6 @@ def cluster_CDHIT(options, splitter):
29
30
  clusters.update({cluster_id: []})
30
31
  pangenome_clusters_First.update({cluster_id: []})
31
32
  pangenome_clusters_First_sequences.update({cluster_id: []})
32
-
33
33
  first = False
34
34
  else:
35
35
  clustered = line.split('\t')[1]
@@ -45,32 +45,46 @@ def cluster_CDHIT(options, splitter):
45
45
  clustered_taxa = clustered.split(splitter)[0]
46
46
  if clustered_taxa not in pangenome_clusters_First[cluster_id]:
47
47
  pangenome_clusters_First[cluster_id].append(clustered_taxa)
48
+ tmp_genomes.append(clustered_taxa)
48
49
  pangenome_clusters_First_sequences[cluster_id].append(clustered)
49
- return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
50
+
51
+ pangenome_clusters_First_genomes[rep] = tmp_genomes
52
+
53
+ return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
50
54
 
51
55
 
52
56
 
53
57
  #@profile
54
- def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, splitter):
58
+ def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, splitter):
55
59
  num_clustered_First = defaultdict(list)
56
60
  pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
57
61
  list_of_reps = list(reps.keys())
58
- for cluster, pep_genomes in pangenome_clusters_First.items():
62
+ for cluster, First_genomes in pangenome_clusters_First.items():
59
63
  rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
60
64
  Com_PEP_Genomes = 0
61
65
  Seconds = 0
62
66
  seen_Seconds = []
63
67
  added_Second_genomes = 0
64
- try: # get the cluster from the storf clusters which contains this rep
65
- clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep] # Not true clusters - I put a PEP as key myself
68
+ temp_pep_genomes = copy.deepcopy(First_genomes)
69
+ try: # get the cluster from the Second clusters which contains this rep
70
+ clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep]
71
+ #We have to do this to correctly account for Seconds grouping multiple original First clusters
72
+ for clust in clustered_combined:
73
+ ### Get the original clustered genomes first:
74
+ if options.sequence_tag not in clust:
75
+ original_clustered_genomes = pangenome_clusters_First_genomes[clust]
76
+ for genome in original_clustered_genomes:
77
+ if genome not in temp_pep_genomes:
78
+ temp_pep_genomes.append(genome)
79
+
66
80
  seen_clust_Genomes = []
67
- num_clustered_First[cluster].append(rep + '_' + str(len(pep_genomes)))
81
+ num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
68
82
  for clust in clustered_combined:
69
83
  if options.sequence_tag not in clust: # Not good enough at the moment
70
84
  clust_Genome = clust.split(splitter)[0]
71
85
  if clust_Genome not in seen_clust_Genomes:
72
86
  seen_clust_Genomes.append(clust_Genome)
73
- if clust_Genome not in pep_genomes:
87
+ if clust_Genome not in First_genomes:
74
88
  Com_PEP_Genomes += 1
75
89
  num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
76
90
  elif options.sequence_tag in clust:
@@ -80,23 +94,38 @@ def combined_clustering_counting(options, pangenome_clusters_First, reps, combin
80
94
  seen_Seconds.append(clust_Genome)
81
95
  if clust_Genome not in seen_clust_Genomes:
82
96
  seen_clust_Genomes.append(clust_Genome)
83
- if clust_Genome not in pep_genomes:
97
+ if clust_Genome not in temp_pep_genomes:
84
98
  added_Second_genomes += 1
99
+ temp_pep_genomes.append(clust_Genome)
85
100
  else:
86
101
  sys.exit("Error: looking for sequence_tag")
87
102
 
88
103
  size_of_pep_clusters = []
89
- peps = num_clustered_First[cluster]
90
- for pep in peps:
91
- pep = pep.rsplit('_', 1)
92
- size_of_pep_clusters.append(int(pep[1]))
93
- pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_pep_clusters),
104
+ genomes = num_clustered_First[cluster]
105
+
106
+
107
+ if len(genomes) > 1: #!!# So that we don't double count - This still needs to account for whether the same genome/genus is present however. Probably need to unique ti
108
+ collecting_genomes = []
109
+ for genome in genomes:
110
+ genome = genome.rsplit('_', 1)
111
+ collecting_genomes.append(pangenome_clusters_First[str(list_of_reps.index(genome[0]))])
112
+ size_of_pep_clusters.append([str(list_of_reps.index(genome[0])) + ':' + genome[1]])
113
+ flattened_list = [item for sublist in collecting_genomes for item in sublist]
114
+ element_counts = Counter(flattened_list)
115
+ unique_elements = [element for element, count in element_counts.items() if count == 1]
116
+ sum_size_of_pep_clusters = len(unique_elements)
117
+ else:
118
+ genome = genomes[0].rsplit('_', 1)
119
+ size_of_pep_clusters.append([str(list_of_reps.index(genome[0]))+':'+genome[1]])
120
+ sum_size_of_pep_clusters = int(genome[1])
121
+
122
+ pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum_size_of_pep_clusters,
94
123
  size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]
95
124
 
96
125
  except KeyError:
97
126
  ###Singleton
98
- num_pep_genomes = [len(pep_genomes)]
99
- pangenome_clusters_Type[cluster] = [1, len(pep_genomes), num_pep_genomes, added_Second_genomes, Seconds,
127
+ num_First_genomes = [[str(cluster)+':'+str(len(First_genomes))]]
128
+ pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_First_genomes, added_Second_genomes, Seconds,
100
129
  len(seen_Seconds)]
101
130
  # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, Added Seconds,Number of Seconds,Unique Seconds ]
102
131
  return pangenome_clusters_Type
@@ -112,20 +141,21 @@ def single_clustering_counting(pangenome_clusters_First, reps):
112
141
  rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
113
142
 
114
143
  try: # get the cluster from the storf clusters which contains this rep
115
- num_clustered_First[cluster].append(rep + '_' + str(len(First_taxa)))
144
+ num_clustered_First[str(cluster)].append(rep + '_' + str(len(First_taxa)))
116
145
  size_of_First_clusters = []
117
- Firsts = num_clustered_First[cluster]
146
+ Firsts = num_clustered_First[str(cluster)]
118
147
  for First in Firsts:
119
148
  First = First.rsplit('_', 1)
120
149
  size_of_First_clusters.append(int(First[1]))
121
150
  recorded_First.append(First[0])
151
+ num_First_genomes = [[str(cluster) + ':' + str(len(First_taxa))]]
122
152
  pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
123
- size_of_First_clusters, 0, 0, 0]
153
+ num_First_genomes, 0, 0, 0]
124
154
 
125
155
  except KeyError:
126
156
  ###Singleton
127
- num_First_taxa = [len(First_taxa)]
128
- pangenome_clusters_Type[cluster] = [1, len(First_taxa), num_First_taxa, 0, 0, 0]
157
+ num_First_genomes = [[str(cluster)+':'+str(len(First_taxa))]]
158
+ pangenome_clusters_Type[cluster] = [1, len(First_taxa), num_First_genomes, 0, 0, 0]
129
159
 
130
160
  # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, 0,0,0 ]
131
161
  return pangenome_clusters_Type
@@ -158,7 +188,7 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
158
188
  else:
159
189
  already_seen_PEP.append(pep)
160
190
  if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
161
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
191
+ if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 First family, we need to record 1 as key and all others are val
162
192
  all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
163
193
  storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
164
194
  VALUE = all_but_first + storfs_clustered
@@ -194,13 +224,13 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
194
224
  combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
195
225
  else:
196
226
  if cluster_id not in not_Second_only_cluster_ids:
197
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
227
+ not_Second_only_cluster_ids.append(cluster_id)
198
228
  if clustered_taxa not in combined_pangenome_clusters_First[cluster_id]:
199
229
  combined_pangenome_clusters_First[cluster_id].append(clustered_taxa)
200
230
  combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)
201
231
 
202
232
 
203
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second
233
+ return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
204
234
 
205
235
 
206
236
  def cluster_EdgeList(options,splitter):
@@ -213,37 +243,45 @@ def cluster_EdgeList(options,splitter):
213
243
  first = True
214
244
  First_in = open(options.clusters, 'r')
215
245
  pangenome_clusters_First = OrderedDict()
246
+ pangenome_clusters_First_genomes = OrderedDict()
216
247
  pangenome_clusters_First_sequences = OrderedDict()
217
248
  taxa_dict = defaultdict(int)
218
249
  reps = OrderedDict()
250
+ tmp_genomes = None
219
251
  for line in First_in:
220
252
  rep, child = line.strip().split(separator)
221
253
  child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
222
254
  # Counting occurrences of genomes
223
255
  taxa_dict[child_taxa] += 1
224
256
  if first == True:
225
- pangenome_clusters_First[0] = []
226
- pangenome_clusters_First_sequences[0] = []
257
+ pangenome_clusters_First['0'] = []
258
+ pangenome_clusters_First_sequences['0'] = []
227
259
  first = False
260
+ tmp_genomes = []
228
261
 
229
262
  if rep != last_rep and last_rep != '':
263
+ pangenome_clusters_First_genomes[rep] = tmp_genomes
264
+ tmp_genomes = []
230
265
  cluster_id +=1
231
- pangenome_clusters_First[cluster_id] = []
232
- pangenome_clusters_First_sequences[cluster_id] = []
233
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id-1])
234
- reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[cluster_id-1])]})
235
- pangenome_clusters_First[cluster_id] = []
236
- pangenome_clusters_First_sequences[cluster_id] = []
237
- if child_taxa not in pangenome_clusters_First[cluster_id]:
238
- pangenome_clusters_First[cluster_id].append(child_taxa)
239
-
240
- pangenome_clusters_First_sequences[cluster_id].append(child)
266
+ pangenome_clusters_First[str(cluster_id)] = []
267
+ pangenome_clusters_First_sequences[str(cluster_id)] = []
268
+ cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id-1)])
269
+ reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id-1)])]})
270
+ pangenome_clusters_First[str(cluster_id)] = []
271
+ pangenome_clusters_First_sequences[str(cluster_id)] = []
272
+ if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
273
+ pangenome_clusters_First[str(cluster_id)].append(child_taxa)
274
+ tmp_genomes.append(child_taxa)
275
+
276
+ pangenome_clusters_First_sequences[str(cluster_id)].append(child)
241
277
  last_rep = rep
242
- cluster_size = len(pangenome_clusters_First_sequences[cluster_id])
243
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[cluster_id])]})
278
+ cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
279
+ reps.update({rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
244
280
 
281
+ #!!# May not be needed below
282
+ pangenome_clusters_First_genomes[rep] = tmp_genomes
245
283
 
246
- return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps
284
+ return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
247
285
 
248
286
 
249
287
  def combined_clustering_Edge_List(options, splitter):
@@ -271,54 +309,54 @@ def combined_clustering_Edge_List(options, splitter):
271
309
  child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
272
310
 
273
311
  if first == True:
274
- Combined_clusters.update({cluster_id: []})
275
- combined_pangenome_clusters_First.update({cluster_id: []})
276
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
277
- combined_pangenome_clusters_Second.update({cluster_id: []})
278
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
312
+ Combined_clusters.update({str(cluster_id): []})
313
+ combined_pangenome_clusters_First.update({str(cluster_id): []})
314
+ combined_pangenome_clusters_First_sequences.update({str(cluster_id): []})
315
+ combined_pangenome_clusters_Second.update({str(cluster_id): []})
316
+ combined_pangenome_clusters_Second_sequences.update({str(cluster_id): []})
279
317
  Combined_reps.update({rep: 0})
280
318
  first = False
281
319
 
282
320
  if first == False:
283
321
  if rep != last_rep and last_rep != '':
284
- cluster_size = len(Combined_clusters[cluster_id])
322
+ cluster_size = len(Combined_clusters[str(cluster_id)])
285
323
  Combined_reps.update({rep: cluster_size})
286
- for pep in combined_pangenome_clusters_First_sequences[cluster_id]:
324
+ for pep in combined_pangenome_clusters_First_sequences[str(cluster_id)]:
287
325
  if pep != []:
288
326
  if pep in already_seen_PEP:
289
327
  continue
290
328
  else:
291
329
  already_seen_PEP.append(pep)
292
- if len(combined_pangenome_clusters_Second_sequences[cluster_id]) > 0 and len(combined_pangenome_clusters_First_sequences[cluster_id]) > 0:
293
- if len(combined_pangenome_clusters_First_sequences[cluster_id]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
294
- all_but_first = combined_pangenome_clusters_First_sequences[cluster_id][1:]
295
- storfs_clustered = combined_pangenome_clusters_Second_sequences[cluster_id]
330
+ if len(combined_pangenome_clusters_Second_sequences[str(cluster_id)]) > 0 and len(combined_pangenome_clusters_First_sequences[str(cluster_id)]) > 0:
331
+ if len(combined_pangenome_clusters_First_sequences[str(cluster_id)]) > 1: # If we have clustered >1 PEP family, we need to record 1 as key and all others are val
332
+ all_but_first = combined_pangenome_clusters_First_sequences[str(cluster_id)][1:]
333
+ storfs_clustered = combined_pangenome_clusters_Second_sequences[str(cluster_id)]
296
334
  VALUE = all_but_first + storfs_clustered
297
335
  else:
298
- VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
299
- KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
336
+ VALUE = combined_pangenome_clusters_Second_sequences[str(cluster_id)]
337
+ KEY = combined_pangenome_clusters_First_sequences[str(cluster_id)][0]
300
338
  combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
301
339
 
302
340
  cluster_id += 1
303
- Combined_clusters.update({cluster_id: []})
304
- combined_pangenome_clusters_First.update({cluster_id: []})
305
- combined_pangenome_clusters_First_sequences.update({cluster_id: []})
306
- combined_pangenome_clusters_Second.update({cluster_id: []})
307
- combined_pangenome_clusters_Second_sequences.update({cluster_id: []})
341
+ Combined_clusters.update({str(cluster_id): []})
342
+ combined_pangenome_clusters_First.update({str(cluster_id): []})
343
+ combined_pangenome_clusters_First_sequences.update({str(cluster_id): []})
344
+ combined_pangenome_clusters_Second.update({str(cluster_id): []})
345
+ combined_pangenome_clusters_Second_sequences.update({str(cluster_id): []})
308
346
  Combined_reps.update({rep: 0})
309
347
 
310
- Combined_clusters[cluster_id].append(child)
348
+ Combined_clusters[str(cluster_id)].append(child)
311
349
  if options.sequence_tag in line:
312
- if child_taxa not in combined_pangenome_clusters_Second[cluster_id]:
313
- combined_pangenome_clusters_Second[cluster_id].append(child_taxa)
314
- combined_pangenome_clusters_Second_sequences[cluster_id].append(child)
350
+ if child_taxa not in combined_pangenome_clusters_Second[str(cluster_id)]:
351
+ combined_pangenome_clusters_Second[str(cluster_id)].append(child_taxa)
352
+ combined_pangenome_clusters_Second_sequences[str(cluster_id)].append(child)
315
353
  else:
316
- if cluster_id not in not_Second_only_cluster_ids:
317
- not_Second_only_cluster_ids.append(cluster_id) # Tell us which StORF_Reporter clustered are unmatched to a PEP
318
- if child_taxa not in combined_pangenome_clusters_First[cluster_id]:
319
- combined_pangenome_clusters_First[cluster_id].append(child_taxa)
320
- combined_pangenome_clusters_First_sequences[cluster_id].append(child)
354
+ if str(cluster_id) not in not_Second_only_cluster_ids:
355
+ not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which StORF_Reporter clustered are unmatched to a PEP
356
+ if child_taxa not in combined_pangenome_clusters_First[str(cluster_id)]:
357
+ combined_pangenome_clusters_First[str(cluster_id)].append(child_taxa)
358
+ combined_pangenome_clusters_First_sequences[str(cluster_id)].append(child)
321
359
 
322
360
  last_rep = rep
323
361
 
324
- return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second
362
+ return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
PyamilySeq/utils.py CHANGED
@@ -7,6 +7,29 @@ from tempfile import NamedTemporaryFile
7
7
  import sys
8
8
 
9
9
 
10
# Codon -> single-letter amino-acid lookup used by translate_frame().
# Translation is currently fixed to Table 11; stop codons map to '*'.
gencode = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '*', 'TAG': '*',
    'TGC': 'C', 'TGT': 'C', 'TGA': '*', 'TGG': 'W',
}


def translate_frame(sequence):
    """Translate a nucleotide sequence read in frame 0 into amino acids.

    The sequence is consumed three bases at a time from its first base.
    Trailing bases that do not form a complete codon are ignored, and any
    codon missing from ``gencode`` (for example one containing ``N``) is
    reported as ``'X'``. Stop codons are emitted as ``'*'``; translation
    does not terminate at them.

    Args:
        sequence: Nucleotide string. Case is ignored, so soft-masked
            (lowercase) input translates the same as uppercase input.

    Returns:
        The translated protein string, ``len(sequence) // 3`` characters long.
    """
    # The lookup table is keyed on uppercase codons only; without this,
    # every lowercase codon would silently become 'X'.
    sequence = sequence.upper()
    return ''.join(
        gencode.get(sequence[start:start + 3], 'X')
        for start in range(0, len(sequence) - 2, 3)
    )
32
+
10
33
  def is_tool_installed(tool_name):
11
34
  """Check if a tool is installed and available in PATH."""
12
35
  # Check if the tool is in the system PATH
@@ -75,7 +98,7 @@ def select_longest_gene(sequences):
75
98
 
76
99
 
77
100
  def run_mafft_on_sequences(options, sequences, output_file):
78
- print("Conducting MAFFT alignment.")
101
+ #print("Conducting MAFFT alignment.")
79
102
  """Run mafft on the given sequences and write to output file."""
80
103
  # Create a temporary input file for mafft
81
104
  with NamedTemporaryFile('w', delete=False) as temp_input_file:
@@ -106,7 +129,7 @@ def run_mafft_on_sequences(options, sequences, output_file):
106
129
 
107
130
 
108
131
 
109
- def read_separate_files(input_dir, name_split, combined_out):
132
+ def read_separate_files(input_dir, name_split, gene_ident, combined_out, translate):
110
133
  with open(combined_out, 'w') as combined_out_file:
111
134
  for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
112
135
  genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -121,7 +144,7 @@ def read_separate_files(input_dir, name_split, combined_out):
121
144
  for line in lines:
122
145
  line_data = line.split('\t')
123
146
  if len(line_data) == 9:
124
- if line_data[2] == 'CDS':
147
+ if any(gene_type in line_data[2] for gene_type in gene_ident):
125
148
  contig = line_data[0]
126
149
  feature = line_data[2]
127
150
  strand = line_data[6]
@@ -158,12 +181,13 @@ def read_separate_files(input_dir, name_split, combined_out):
158
181
  corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
159
182
  full_sequence = fasta_dict[contig][1]
160
183
  cds_sequence = full_sequence[corrected_start:corrected_stop]
161
-
184
+ if translate == True:
185
+ cds_sequence = translate_frame(cds_sequence)
162
186
  wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
163
187
  combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
164
188
 
165
189
 
166
- def read_combined_files(input_dir, name_split, combined_out):
190
+ def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
167
191
  with open(combined_out, 'w') as combined_out_file:
168
192
  for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
169
193
  genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -186,7 +210,7 @@ def read_combined_files(input_dir, name_split, combined_out):
186
210
  else:
187
211
  line_data = line.split('\t')
188
212
  if len(line_data) == 9:
189
- if line_data[2] == 'CDS':
213
+ if any(gene_type in line_data[2] for gene_type in gene_ident):
190
214
  contig = line_data[0]
191
215
  feature = line_data[2]
192
216
  strand = line_data[6]
@@ -215,5 +239,91 @@ def read_combined_files(input_dir, name_split, combined_out):
215
239
  full_sequence = fasta_dict[contig][1]
216
240
  cds_sequence = full_sequence[corrected_start:corrected_stop]
217
241
 
242
+ if translate == True:
243
+ cds_sequence = translate_frame(cds_sequence)
218
244
  wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
219
- combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
245
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
246
+
247
+
248
def read_fasta_files(input_dir, name_split, combined_out, translate):
    """Merge every matching FASTA file in ``input_dir`` into one FASTA file.

    Each input file is treated as one genome. The genome name is the part of
    the file's base name before the first occurrence of ``name_split``, and
    every record is written as ``>{genome_name}|{record_id}`` with the
    sequence wrapped at 60 characters per line.

    Args:
        input_dir: Directory searched with the pattern ``'*' + name_split``.
        name_split: File-name suffix used both to select the files and to
            derive the genome name.
        combined_out: Path of the combined FASTA file to (over)write.
        translate: When equal to ``True``, each sequence is passed through
            ``translate_frame`` before being written.

    Notes:
        * The record id is the first whitespace-delimited token of the header.
        * If an id is repeated within one file, the later record replaces the
          earlier one.
        * Lines appearing before the first ``>`` header (blank lines, stray
          text) are skipped instead of raising an unbound-variable error.
    """
    with open(combined_out, 'w') as combined_out_file:
        for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
            genome_name = os.path.basename(fasta_file).split(name_split)[0]

            # record id -> list of sequence chunks, in file order.
            records = {}
            current_seq = None
            with open(fasta_file, 'r') as handle:
                for line in handle:
                    if line.startswith('>'):
                        current_seq = line[1:].split()[0]
                        records[current_seq] = []
                    elif current_seq is not None:
                        records[current_seq].append(line.strip())
                    # else: content before any header has no record to
                    # belong to, so it is ignored.

            for seq_id, chunks in records.items():
                seq = ''.join(chunks)
                # Equality comparison kept deliberately so non-bool flag
                # values behave exactly as they did before.
                if translate == True:  # noqa: E712
                    seq = translate_frame(seq)
                wrapped_sequence = '\n'.join(
                    [seq[i:i + 60] for i in range(0, len(seq), 60)])
                combined_out_file.write(
                    f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
266
+
267
+
268
def write_groups(options,output_dir, key_order, cores, sequences, pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
    """Write one FASTA file per selected gene group into ``output_dir``.

    A group key from ``cores`` is selected when at least one of its
    ``'_'``-separated parts appears in the comma-separated
    ``options.write_groups`` string. For every selected key that starts with
    a prefix from ``key_order``, one file named ``{key}_{value}.fasta`` is
    written per value stored under that key.

    Args:
        options: Object providing ``write_groups`` (comma-separated string)
            and ``verbose``.
        output_dir: Directory for the FASTA files; created when missing.
        key_order: Iterable of key prefixes, processed in the given order.
        cores: Mapping of group key -> iterable of cluster identifiers.
        sequences: Mapping of sequence header -> sequence string.
        pangenome_clusters_First_sequences_sorted: Mapping of cluster
            identifier -> sequence headers, used when the prefix contains
            ``'First'``.
        combined_pangenome_clusters_Second_sequences: Mapping of cluster
            identifier -> sequence headers, used for every other prefix.
            The original code notes that this is None when reclustering is
            not in use; in that case a non-'First' prefix still fails on
            lookup, as before.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Parse the selection once rather than once per key part. A missing
    # selection (None) selects nothing instead of raising on .split().
    selection = options.write_groups
    requested_groups = set(selection.split(',')) if selection is not None else set()

    for key_prefix in key_order:
        use_first = 'First' in key_prefix
        for key, values in cores.items():
            if requested_groups.isdisjoint(key.split('_')):
                continue
            if not key.startswith(key_prefix):
                continue
            for value in values:
                output_filename = f"{key}_{value}.fasta"
                if use_first:
                    sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
                else:
                    sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
                # Only headers present in the sequences dictionary are written.
                with open(os.path.join(output_dir, output_filename), 'w') as outfile:
                    for header in sequences_to_write:
                        if header in sequences:
                            outfile.write(f">{header}\n")
                            wrapped_sequence = wrap_sequence(sequences[header])
                            outfile.write(f"{wrapped_sequence}\n")
                        elif options.verbose == True:  # noqa: E712
                            print("Sequence " + header + " Not found in original_fasta file.")
292
+
293
def process_gene_families(options, directory, output_file):
    """Align each gene family and concatenate the alignments per genome.

    For every ``.fasta`` file in ``directory`` the longest sequence per
    genome is selected, the selection is aligned with MAFFT, and each aligned
    sequence is appended to the running sequence of its genome (the part of
    the record id before the first ``'|'``). The result is written as FASTA
    wrapped at 60 columns.

    Args:
        options: Passed through unchanged to ``run_mafft_on_sequences``.
        directory: Directory holding the per-family ``.fasta`` files.
        output_file: Replacement for the ``'Gene_Families_Output'`` substring
            of ``directory``; the resulting path is where the concatenated
            alignment is written.

    Notes:
        * Families are processed in sorted file-name order so the column
          order of the output is reproducible across platforms.
        * The temporary ``{gene_file}_aligned.fasta`` file is created
          relative to the current working directory and is removed even when
          alignment or parsing fails.
        * NOTE(review): a genome absent from a family gets no gap padding, so
          concatenated lengths can differ between genomes. Confirm whether
          downstream consumers rely on equal lengths.
    """
    output_file = directory.replace('Gene_Families_Output', output_file)

    # genome name -> list of aligned chunks, joined once at write time.
    concatenated_chunks = {}

    for gene_file in sorted(os.listdir(directory)):
        if not gene_file.endswith('.fasta'):
            continue
        gene_path = os.path.join(directory, gene_file)

        # Read the family and keep only the longest sequence per genome.
        sequences = read_fasta(gene_path)
        longest_sequences = select_longest_gene(sequences)

        aligned_file = f"{gene_file}_aligned.fasta"
        try:
            run_mafft_on_sequences(
                options,
                {seq_id: seq for seq_id, seq in longest_sequences.values()},
                aligned_file)
            aligned_sequences = read_fasta(aligned_file)
        finally:
            # Never leave the intermediate alignment behind, even on error.
            if os.path.exists(aligned_file):
                os.remove(aligned_file)

        for genome, aligned_seq in aligned_sequences.items():
            genome_name = genome.split('|')[0]
            concatenated_chunks.setdefault(genome_name, []).append(aligned_seq)

    # Write the concatenated sequences to the output file.
    with open(output_file, 'w') as out:
        for genome, chunks in concatenated_chunks.items():
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(''.join(chunks), 60)
            out.write(f"{wrapped_sequence}\n")