PyamilySeq 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,15 +11,16 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
11
11
 
12
12
 
13
13
  def main():
14
- parser = argparse.ArgumentParser(description='Seq-Combiner ' + PyamilySeq_Version + ': Seq-Combiner Run Parameters.')
14
+ parser = argparse.ArgumentParser(description='Seq-Combiner ' + PyamilySeq_Version + ': A tool to extract sequences from GFF/FASTA files.')
15
15
  ### Required Arguments
16
16
  required = parser.add_argument_group('Required Arguments')
17
17
  required.add_argument('-input_dir', action='store', dest='input_dir',
18
18
  help='Directory location where the files are located.',
19
19
  required=True)
20
- required.add_argument("-input_type", action="store", dest="input_type", choices=['separate', 'combined'],
21
- help="Type of input files: 'separate' for separate FASTA and GFF files,"
22
- " 'combined' for GFF files with embedded FASTA sequences.",
20
+ required.add_argument('-input_type', action='store', dest='input_type', choices=['separate', 'combined', 'fasta'],
21
+ help='Type of input files: "separate" for separate FASTA and GFF files,'
22
+ ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
23
+ 'FASTA files together.',
23
24
  required=True)
24
25
  required.add_argument("-name_split", action="store", dest="name_split",
25
26
  help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
@@ -30,15 +31,33 @@ def main():
30
31
  required.add_argument("-output_name", action="store", dest="output_file",
31
32
  help="Output file name.",
32
33
  required=True)
34
+ optional = parser.add_argument_group('Optional Arguments')
35
+ optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
36
+ help='Default - "CDS": Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"'
37
+ ' - Not compatible with "fasta" input mode.',
38
+ required=False)
39
+ optional.add_argument('-translate', action='store_true', dest='translate', default=None,
40
+ help='Default - False: Translate extracted sequences to their AA counterpart?',
41
+ required=False)
42
+ misc = parser.add_argument_group('Misc Arguments')
43
+ misc.add_argument('-v', action='store_true', dest='version',
44
+ help='Print out version number and exit',
45
+ required=False)
46
+
33
47
  options = parser.parse_args()
34
48
 
49
+ if options.version:
50
+ sys.exit(PyamilySeq_Version)
51
+
35
52
  output_path = os.path.abspath(options.output_dir)
36
53
  combined_out_file = os.path.join(output_path, options.output_file)
37
54
 
38
55
  if options.input_type == 'separate':
39
- read_separate_files(options.input_dir, options.name_split, )
40
- else:
41
- read_combined_files(options.input_dir, options.name_split, combined_out_file)
56
+ read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
57
+ elif options.input_type == 'combined':
58
+ read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
59
+ elif options.input_type == 'fasta':
60
+ read_fasta_files(options.input_dir, options.name_split, combined_out_file, options.translate)
42
61
 
43
62
  if __name__ == "__main__":
44
63
  main()
@@ -0,0 +1,362 @@
1
+
2
+ import sys
3
+ import copy
4
+ from collections import OrderedDict
5
+ from collections import defaultdict
6
+ from collections import Counter
7
+
8
def cluster_CDHIT(options, splitter):
    """Parse a CD-HIT ``.clstr`` file into per-cluster genome/sequence maps.

    Args:
        options: Object whose ``clusters`` attribute is the path of the
            cluster file to read.
        splitter: Substring separating the genome (taxon) name from the rest
            of a sequence identifier; the text before its first occurrence is
            used as the genome name.

    Returns:
        A 5-tuple ``(taxa_dict, pangenome_clusters_First,
        pangenome_clusters_First_genomes, pangenome_clusters_First_sequences,
        reps)``:

        * ``taxa_dict`` - ``defaultdict(int)`` counting member lines per genome.
        * ``pangenome_clusters_First`` - cluster id -> unique genomes, in
          first-seen order.
        * ``pangenome_clusters_First_genomes`` - representative sequence ->
          unique genomes of its cluster.
        * ``pangenome_clusters_First_sequences`` - cluster id -> every member
          sequence identifier.
        * ``reps`` - representative sequence ->
          ``[number of sequences, number of unique genomes]``.

    An empty file yields empty containers.
    """
    clusters = OrderedDict()
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_genomes = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    taxa_dict = defaultdict(int)
    reps = OrderedDict()
    cluster_id = None   # stays None until the first '>Cluster N' header
    rep = None          # most recent sequence flagged with '*'
    tmp_genomes = None  # unique genomes of the cluster being read

    def _finish_cluster():
        # Record the completed cluster under its representative. Called at
        # every header AND once after the loop, so the final cluster gets its
        # real sizes instead of keeping the [0, 0] placeholder.
        if cluster_id is None or rep is None:
            return
        pangenome_clusters_First_genomes[rep] = tmp_genomes
        reps[rep] = [len(clusters[cluster_id]),
                     len(pangenome_clusters_First[cluster_id])]

    # 'with' guarantees the handle is closed even if parsing raises.
    with open(options.clusters, 'r') as First_in:
        for line in First_in:
            if not line.strip():
                continue  # tolerate blank lines
            if line.startswith('>'):
                _finish_cluster()
                tmp_genomes = []
                # Header looks like '>Cluster 12' -> keep the '12'.
                cluster_id = line.strip('>').strip('\n').split(' ')[1]
                clusters[cluster_id] = []
                pangenome_clusters_First[cluster_id] = []
                pangenome_clusters_First_sequences[cluster_id] = []
                continue

            # Member line: '<n>\t<len>, ><sequence id>... <* | at xx%>'
            clustered = line.split('\t')[1].split('>')[1].split('...')[0]
            taxa = clustered.split(splitter)[0]
            taxa_dict[taxa] += 1
            if '*' in line:
                rep = clustered
                reps[rep] = [0, 0]  # placeholder; filled in by _finish_cluster
            if cluster_id is None:
                continue  # member line before any header: counted only
            clusters[cluster_id].append(clustered)
            if taxa not in pangenome_clusters_First[cluster_id]:
                pangenome_clusters_First[cluster_id].append(taxa)
                tmp_genomes.append(taxa)
            pangenome_clusters_First_sequences[cluster_id].append(clustered)

    _finish_cluster()

    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
54
+
55
+
56
+
57
+ #@profile
58
def combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, splitter):
    """Summarise each First cluster after a second round of clustering.

    Args:
        options: Object whose ``sequence_tag`` attribute is the substring that
            marks a sequence identifier as a "Second" sequence.
        pangenome_clusters_First: cluster id -> list of genomes. Cluster ids
            must be convertible with ``int()`` and index into ``reps``.
        reps: representative -> ``[sequence count, genome count]``; its key
            order is used to map a cluster id to its representative.
        combined_pangenome_clusters_First_Second_clustered: representative ->
            sequences grouped with it by the second clustering.
        pangenome_clusters_First_genomes: representative -> genomes of its
            original cluster.
        splitter: Substring separating the genome name from the rest of a
            sequence identifier.

    Returns:
        A copy of ``pangenome_clusters_First`` in which every value is replaced
        by ``[number of First clusters grouped, size of the cluster,
        per-cluster 'id:size' entries, added Second genomes,
        number of Seconds, unique Second genomes]``.

    Any ``KeyError`` raised while processing a cluster (most commonly: its
    representative has no entry in the combined mapping) makes the cluster be
    reported as a singleton, using whatever Second counters had accumulated.
    """
    num_clustered_First = defaultdict(list)
    # Shallow copy keeps mapping type and key order; every value is replaced.
    pangenome_clusters_Type = copy.copy(pangenome_clusters_First)
    list_of_reps = list(reps.keys())
    # Position of every representative, pre-computed once so the inner loops
    # do not rescan list_of_reps (previously list.index per lookup).
    rep_position = {name: str(position) for position, name in enumerate(list_of_reps)}

    for cluster, First_genomes in pangenome_clusters_First.items():
        rep = list_of_reps[int(cluster)]  # representative of this First cluster
        Seconds = 0
        seen_Seconds = []
        added_Second_genomes = 0
        temp_pep_genomes = list(First_genomes)
        try:
            # KeyError here means nothing was grouped with this representative.
            clustered_combined = combined_pangenome_clusters_First_Second_clustered[rep]

            # First gather the genomes of every First cluster merged in, so
            # Seconds are only counted as "added" for genuinely new genomes.
            for clust in clustered_combined:
                if options.sequence_tag not in clust:
                    for genome in pangenome_clusters_First_genomes[clust]:
                        if genome not in temp_pep_genomes:
                            temp_pep_genomes.append(genome)

            seen_clust_Genomes = []
            num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
            for clust in clustered_combined:
                clust_Genome = clust.split(splitter)[0]
                if options.sequence_tag not in clust:
                    # Another First representative merged into this cluster.
                    if clust_Genome not in seen_clust_Genomes:
                        seen_clust_Genomes.append(clust_Genome)
                    num_clustered_First[cluster].append(clust + '_' + str(reps[clust][1]))
                else:
                    Seconds += 1
                    if clust_Genome not in seen_Seconds:
                        seen_Seconds.append(clust_Genome)
                    if clust_Genome not in seen_clust_Genomes:
                        seen_clust_Genomes.append(clust_Genome)
                        if clust_Genome not in temp_pep_genomes:
                            added_Second_genomes += 1
                            temp_pep_genomes.append(clust_Genome)

            size_of_pep_clusters = []
            genomes = num_clustered_First[cluster]

            if len(genomes) > 1:
                # Several First clusters were merged: count genomes that occur
                # in exactly one of them so shared genomes are not double counted.
                element_counts = Counter()
                for genome in genomes:
                    name, size = genome.rsplit('_', 1)
                    position = rep_position[name]
                    element_counts.update(pangenome_clusters_First[position])
                    size_of_pep_clusters.append([position + ':' + size])
                sum_size_of_pep_clusters = sum(
                    1 for count in element_counts.values() if count == 1)
            else:
                name, size = genomes[0].rsplit('_', 1)
                size_of_pep_clusters.append([rep_position[name] + ':' + size])
                sum_size_of_pep_clusters = int(size)

            pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum_size_of_pep_clusters,
                                                size_of_pep_clusters, added_Second_genomes, Seconds, len(seen_Seconds)]

        except KeyError:
            # Singleton: report the First cluster on its own.
            num_First_genomes = [[str(cluster) + ':' + str(len(First_genomes))]]
            pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_First_genomes, added_Second_genomes, Seconds,
                                                len(seen_Seconds)]
    # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, Added Seconds, Number of Seconds, Unique Seconds]
    return pangenome_clusters_Type
132
+
133
+
134
+ #@profile
135
def single_clustering_counting(pangenome_clusters_First, reps):
    """Summarise First clusters when no second round of clustering was run.

    Args:
        pangenome_clusters_First: cluster id -> list of genomes (taxa). Cluster
            ids are expected to be strings convertible with ``int()``.
        reps: Mapping keyed by representative sequence; only the number of
            keys matters here (one per cluster).

    Returns:
        A copy of ``pangenome_clusters_First`` in which every value is replaced
        by ``[1, number of genomes, [['<cluster id>:<number of genomes>']],
        0, 0, 0]`` - the same layout produced by the combined counting, with
        the three Second-related fields fixed at zero.

    Raises:
        IndexError: if a cluster id has no matching position in ``reps``.
        ValueError: if a cluster id cannot be converted with ``int()``.
    """
    # Shallow copy keeps mapping type and key order; every value is replaced.
    pangenome_clusters_Type = copy.copy(pangenome_clusters_First)
    list_of_reps = list(reps.keys())
    for cluster, First_taxa in pangenome_clusters_First.items():
        # Kept purely as a consistency check: every cluster must have a
        # representative at the matching position.
        _ = list_of_reps[int(cluster)]
        cluster_size = len(First_taxa)
        num_First_genomes = [[str(cluster) + ':' + str(cluster_size)]]
        pangenome_clusters_Type[cluster] = [1, cluster_size, num_First_genomes, 0, 0, 0]

    # pangenome_clusters_Type = [Number of First clustered genomes or genera, Size of the cluster, Ditto, 0,0,0 ]
    return pangenome_clusters_Type
162
+
163
+
164
+
165
+ #@profile
166
def combined_clustering_CDHIT(options, taxa_dict, splitter):
    """Parse the second-round CD-HIT ``.clstr`` file (First + Second sequences).

    Args:
        options: Object providing ``reclustered`` (path of the cluster file)
            and ``sequence_tag`` (substring marking a member line as a
            "Second" sequence).
        taxa_dict: Mapping genome -> count; incremented in place for every
            member line read.
        splitter: Substring separating the genome name from the rest of a
            sequence identifier.

    Returns:
        A 4-tuple:

        * ``combined_pangenome_clusters_First_Second_clustered`` - for every
          cluster holding at least one First and one Second sequence: first
          First sequence -> remaining First sequences followed by the Second
          sequences.
        * ``not_Second_only_cluster_ids`` - ids of clusters containing at
          least one First sequence, in first-seen order.
        * ``combined_pangenome_clusters_Second`` - cluster id -> unique
          genomes contributed by Second sequences.
        * ``combined_pangenome_clusters_Second_sequences`` - cluster id ->
          Second sequence identifiers.
    """
    combined_pangenome_clusters_First = OrderedDict()
    combined_pangenome_clusters_First_sequences = OrderedDict()
    combined_pangenome_clusters_Second = OrderedDict()
    combined_pangenome_clusters_Second_sequences = OrderedDict()
    combined_pangenome_clusters_First_Second_clustered = OrderedDict()

    not_Second_only_cluster_ids = []
    not_Second_only_seen = set()  # O(1) membership companion to the list
    cluster_id = None             # None until the first '>Cluster N' header

    def _record_mixed_cluster(finished_id):
        # Register a finished cluster if it mixes First and Second sequences.
        first_seqs = combined_pangenome_clusters_First_sequences[finished_id]
        second_seqs = combined_pangenome_clusters_Second_sequences[finished_id]
        if not first_seqs or not second_seqs:
            return
        if len(first_seqs) > 1:
            # More than one First family: the first is the key, rest are values.
            value = first_seqs[1:] + second_seqs
        else:
            value = second_seqs
        combined_pangenome_clusters_First_Second_clustered[first_seqs[0]] = value

    # 'with' guarantees the handle is closed even if parsing raises.
    with open(options.reclustered, 'r') as Second_in:
        for line in Second_in:
            if not line.strip():
                continue  # tolerate blank lines
            if line.startswith('>'):
                if cluster_id is not None:
                    _record_mixed_cluster(cluster_id)
                # Header looks like '>Cluster 12' -> keep the '12'.
                cluster_id = line.strip('>').strip('\n').split(' ')[1]
                combined_pangenome_clusters_First[cluster_id] = []
                combined_pangenome_clusters_First_sequences[cluster_id] = []
                combined_pangenome_clusters_Second[cluster_id] = []
                combined_pangenome_clusters_Second_sequences[cluster_id] = []
                continue

            # Member line: '<n>\t<len>, ><sequence id>... <* | at xx%>'
            clustered = line.split('\t')[1].split('>')[1].split('...')[0]
            clustered_taxa = clustered.split(splitter)[0]
            taxa_dict[clustered_taxa] += 1
            if cluster_id is None:
                continue  # member line before any header: counted only

            if options.sequence_tag in line:
                if clustered_taxa not in combined_pangenome_clusters_Second[cluster_id]:
                    combined_pangenome_clusters_Second[cluster_id].append(clustered_taxa)
                combined_pangenome_clusters_Second_sequences[cluster_id].append(clustered)
            else:
                if cluster_id not in not_Second_only_seen:
                    not_Second_only_seen.add(cluster_id)
                    not_Second_only_cluster_ids.append(cluster_id)
                if clustered_taxa not in combined_pangenome_clusters_First[cluster_id]:
                    combined_pangenome_clusters_First[cluster_id].append(clustered_taxa)
                combined_pangenome_clusters_First_sequences[cluster_id].append(clustered)

    # The final cluster has no following header, so record it here.
    if cluster_id is not None:
        _record_mixed_cluster(cluster_id)

    return combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
234
+
235
+
236
def cluster_EdgeList(options,splitter):
    """Parse a two-column ``representative<sep>member`` edge list of clusters.

    Consecutive lines sharing the same representative form one cluster;
    clusters are numbered ``'0'``, ``'1'``, ... in file order.

    Args:
        options: Object providing ``clusters`` (path of the edge-list file)
            and ``cluster_format`` (``'TSV'`` or ``'CSV'``).
        splitter: Substring separating the genome (taxon) name from the rest
            of a sequence identifier.

    Returns:
        A 5-tuple ``(taxa_dict, pangenome_clusters_First,
        pangenome_clusters_First_genomes, pangenome_clusters_First_sequences,
        reps)``:

        * ``taxa_dict`` - ``defaultdict(int)`` counting member lines per genome.
        * ``pangenome_clusters_First`` - cluster id -> unique genomes.
        * ``pangenome_clusters_First_genomes`` - representative -> unique
          genomes of its own cluster.
        * ``pangenome_clusters_First_sequences`` - cluster id -> member
          sequence identifiers.
        * ``reps`` - representative ->
          ``[number of sequences, number of unique genomes]``.

    Raises:
        ValueError: if ``options.cluster_format`` is neither ``'TSV'`` nor
            ``'CSV'``, or a non-blank line does not have exactly two columns.
    """
    if options.cluster_format == 'TSV':
        separator = '\t'
    elif options.cluster_format == 'CSV':
        separator = ','
    else:
        raise ValueError("Unsupported cluster_format: " + repr(options.cluster_format))

    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_genomes = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    taxa_dict = defaultdict(int)
    reps = OrderedDict()
    cluster_id = -1     # becomes 0 when the first cluster starts
    last_rep = None     # representative of the cluster being read
    tmp_genomes = None  # unique genomes of the cluster being read

    def _finish_cluster():
        # Store the finished cluster under ITS OWN representative (last_rep),
        # not the representative of the cluster that is about to start.
        key = str(cluster_id)
        pangenome_clusters_First_genomes[last_rep] = tmp_genomes
        reps[last_rep] = [len(pangenome_clusters_First_sequences[key]),
                          len(pangenome_clusters_First[key])]

    # 'with' guarantees the handle is closed even if parsing raises.
    with open(options.clusters, 'r') as First_in:
        for line in First_in:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            rep, child = line.split(separator)
            child_taxa = child.split(splitter)[0]  # genome identifier of the member
            taxa_dict[child_taxa] += 1

            if rep != last_rep:
                if last_rep is not None:
                    _finish_cluster()
                cluster_id += 1
                pangenome_clusters_First[str(cluster_id)] = []
                pangenome_clusters_First_sequences[str(cluster_id)] = []
                tmp_genomes = []
                last_rep = rep

            if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
                pangenome_clusters_First[str(cluster_id)].append(child_taxa)
                tmp_genomes.append(child_taxa)
            pangenome_clusters_First_sequences[str(cluster_id)].append(child)

    # The final cluster has no following representative change.
    if last_rep is not None:
        _finish_cluster()

    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
285
+
286
+
287
def combined_clustering_Edge_List(options, splitter):
    """Parse the second-round edge list (First + Second sequences).

    Consecutive ``representative<sep>member`` lines sharing a representative
    form one cluster; clusters are numbered ``'0'``, ``'1'``, ... in file
    order.

    Args:
        options: Object providing ``reclustered`` (path of the edge-list
            file), ``cluster_format`` (``'TSV'`` or ``'CSV'``) and
            ``sequence_tag`` (substring marking a member as a "Second"
            sequence).
        splitter: Substring separating the genome name from the rest of a
            sequence identifier.

    Returns:
        A 4-tuple:

        * ``combined_pangenome_clusters_First_Second_clustered`` - for every
          cluster holding at least one First and one Second member: first
          First member -> remaining First members followed by the Second
          members.
        * ``not_Second_only_cluster_ids`` - ids of clusters containing at
          least one First member.
        * ``combined_pangenome_clusters_Second`` - cluster id -> unique
          genomes contributed by Second members.
        * ``combined_pangenome_clusters_Second_sequences`` - cluster id ->
          Second member identifiers.

    Raises:
        ValueError: if ``options.cluster_format`` is neither ``'TSV'`` nor
            ``'CSV'``, or a non-blank line does not have exactly two columns.
    """
    if options.cluster_format == 'TSV':
        separator = '\t'
    elif options.cluster_format == 'CSV':
        separator = ','
    else:
        raise ValueError("Unsupported cluster_format: " + repr(options.cluster_format))

    combined_pangenome_clusters_First = OrderedDict()
    combined_pangenome_clusters_First_sequences = OrderedDict()
    combined_pangenome_clusters_Second = OrderedDict()
    combined_pangenome_clusters_Second_sequences = OrderedDict()
    combined_pangenome_clusters_First_Second_clustered = OrderedDict()

    not_Second_only_cluster_ids = []
    cluster_id = -1   # becomes 0 when the first cluster starts
    last_rep = None   # representative of the cluster being read

    def _record_mixed_cluster(finished_id):
        # Register a finished cluster if it mixes First and Second members.
        first_seqs = combined_pangenome_clusters_First_sequences[finished_id]
        second_seqs = combined_pangenome_clusters_Second_sequences[finished_id]
        if not first_seqs or not second_seqs:
            return
        if len(first_seqs) > 1:
            # More than one First family: the first is the key, rest are values.
            value = first_seqs[1:] + second_seqs
        else:
            value = second_seqs
        combined_pangenome_clusters_First_Second_clustered[first_seqs[0]] = value

    # 'with' guarantees the handle is closed even if parsing raises.
    with open(options.reclustered, 'r') as Second_in:
        for line in Second_in:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            rep, child = line.split(separator)
            child_taxa = child.split(splitter)[0]  # genome identifier of the member

            if rep != last_rep:
                if last_rep is not None:
                    _record_mixed_cluster(str(cluster_id))
                cluster_id += 1
                combined_pangenome_clusters_First[str(cluster_id)] = []
                combined_pangenome_clusters_First_sequences[str(cluster_id)] = []
                combined_pangenome_clusters_Second[str(cluster_id)] = []
                combined_pangenome_clusters_Second_sequences[str(cluster_id)] = []
                last_rep = rep

            key = str(cluster_id)
            # Classify by the member itself: testing the whole line would let
            # a tagged representative turn every member into a "Second".
            if options.sequence_tag in child:
                if child_taxa not in combined_pangenome_clusters_Second[key]:
                    combined_pangenome_clusters_Second[key].append(child_taxa)
                combined_pangenome_clusters_Second_sequences[key].append(child)
            else:
                # Cluster ids only ever increase, so checking the tail suffices.
                if not not_Second_only_cluster_ids or not_Second_only_cluster_ids[-1] != key:
                    not_Second_only_cluster_ids.append(key)
                if child_taxa not in combined_pangenome_clusters_First[key]:
                    combined_pangenome_clusters_First[key].append(child_taxa)
                combined_pangenome_clusters_First_sequences[key].append(child)

    # The final cluster has no following representative change.
    if last_rep is not None:
        _record_mixed_cluster(str(cluster_id))

    return combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences