PyamilySeq 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Summary.py +163 -0
- PyamilySeq/Group_Extractor.py +83 -0
- PyamilySeq/Group_Sizes.py +87 -0
- PyamilySeq/Group_Splitter.py +571 -0
- PyamilySeq/PyamilySeq.py +321 -0
- PyamilySeq/PyamilySeq_Genus.py +242 -0
- PyamilySeq/PyamilySeq_Species.py +309 -0
- PyamilySeq/Seq_Combiner.py +83 -0
- PyamilySeq/Seq_Extractor.py +64 -0
- PyamilySeq/Seq_Finder.py +56 -0
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/clusterings.py +452 -0
- PyamilySeq/constants.py +2 -0
- PyamilySeq/utils.py +598 -0
- pyamilyseq-1.1.0.dist-info/METADATA +342 -0
- pyamilyseq-1.1.0.dist-info/RECORD +20 -0
- {PyamilySeq-1.0.0.dist-info → pyamilyseq-1.1.0.dist-info}/WHEEL +1 -1
- pyamilyseq-1.1.0.dist-info/entry_points.txt +13 -0
- pyamilyseq-1.1.0.dist-info/top_level.txt +1 -0
- PyamilySeq-1.0.0.dist-info/METADATA +0 -17
- PyamilySeq-1.0.0.dist-info/RECORD +0 -6
- PyamilySeq-1.0.0.dist-info/entry_points.txt +0 -2
- PyamilySeq-1.0.0.dist-info/top_level.txt +0 -1
- {PyamilySeq-1.0.0.dist-info → pyamilyseq-1.1.0.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
|
|
2
|
+
try:
|
|
3
|
+
from .constants import *
|
|
4
|
+
from .clusterings import *
|
|
5
|
+
from .utils import *
|
|
6
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
7
|
+
from constants import *
|
|
8
|
+
from clusterings import *
|
|
9
|
+
from utils import *
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
    """Write a Roary-style gene_presence_absence.csv for the First clustering.

    Parameters:
        options: parsed CLI namespace; only ``output_dir`` is read here.
        genome_dict: ordered mapping genome name -> genome data; key order
            fixes the per-genome column order of the CSV.
        pangenome_clusters_First_sorted: cluster id -> list of member genomes.
        pangenome_clusters_First_sequences_sorted: cluster id -> list of
            'genome|gene' sequence identifiers in the cluster.
    """
    print("Outputting gene_presence_absence file")
    output_dir = os.path.abspath(options.output_dir)
    gpa_path = os.path.join(output_dir, 'gene_presence_absence.csv')
    # 'with' guarantees the handle is flushed/closed (the original left it open).
    with open(gpa_path, 'w') as gpa_outfile:
        # BUG FIX: the original concatenated '...,"' + '"Accessory...' which
        # emitted a doubled quote ('""Accessory Fragment"') and broke strict
        # CSV parsers.  The header now matches the Roary column layout exactly.
        gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
                          '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
        gpa_outfile.write('","'.join(genome_dict.keys()))
        gpa_outfile.write('"\n')
        for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
            average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
            gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
                              '","","","","","","","","",""')
            # One column per genome listing that genome's gene IDs
            # (space-separated inside the quoted cell, empty cell when absent).
            for genome in genome_dict.keys():
                gene_ids = [value.split('|')[1] for value in sequences if value.split('|')[0] == genome]
                if gene_ids:
                    gpa_outfile.write(',"'+' '.join(gene_ids)+'"')
                else:
                    gpa_outfile.write(',""')
            gpa_outfile.write('\n')
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_cores(options, genome_dict):
    """Build percentage-based core bins and their empty result containers.

    For each percentage in ``options.species_groups`` (comma-separated, e.g.
    '99,95,15'), the bin's lower bound is floor(pct% of the genome count) and
    its upper bound is the previous (larger) bin's lower bound, so the bins
    tile the genome-count range without gaps.

    Returns:
        (cores, groups): ``cores`` maps category names such as
        'First_core_99' to empty lists (extra Second/extended/combined
        categories are added when ``options.reclustered`` is set); ``groups``
        maps each percentage string to its (lower, upper) genome-count bounds.
    """
    groups = OrderedDict()
    cores = OrderedDict()
    prev_top = len(genome_dict)
    for group in options.species_groups.split(','):
        calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
        # NOTE: the original guarded this assignment with a 'first' flag whose
        # two branches were identical; the dead flag has been removed
        # (behavior unchanged).
        groups[group] = (calculated_floor, prev_top)
        prev_top = calculated_floor
        cores['First_core_' + group] = []
        if options.reclustered != None:
            cores['extended_core_' + group] = []
            cores['combined_core_' + group] = []
            cores['Second_core_' + group] = []
            cores['only_Second_core_' + group] = []
    return cores, groups
|
|
83
|
+
#@profile
|
|
84
|
+
def calc_First_only_core(cluster, First_num, groups, cores):
    """Append *cluster* to the 'First_core_' bin whose (lower, upper) range
    contains First_num; when several bins match, the last one wins."""
    genome_count = int(First_num)
    for position, (lower, upper) in enumerate(groups.values()):
        if lower <= genome_count <= upper:
            res = position
    # A NameError here (res unbound) means no bin matched, as in the original.
    family_group = list(groups)[res]
    cores['First_core_' + family_group].append(cluster)
|
+
|
|
91
|
+
#@profile
|
|
92
|
+
def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
    """Bin *cluster* by the combined First+Second genome count into the
    matching 'extended_core_' category; the last matching bin wins."""
    combined_total = First_num + Second_num
    for position, (lower, upper) in enumerate(groups.values()):
        if lower <= combined_total <= upper:
            res = position
    family_group = list(groups)[res]
    cores['extended_core_' + family_group].append(cluster)
|
100
|
+
|
|
101
|
+
#@profile
|
|
102
|
+
def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
    """Bin *cluster* by the combined First+Second genome count into the
    matching 'combined_core_' category; the FIRST matching bin wins."""
    combined_total = First_num + Second_num
    for position, (lower, upper) in enumerate(list(groups.values())):
        if lower <= combined_total <= upper:
            res = position
            break  # unlike the other calc_* helpers, stop at the first match
    family_group = list(groups)[res]
    cores['combined_core_' + family_group].append(cluster)
|
112
|
+
|
|
113
|
+
#@profile
|
|
114
|
+
def calc_Second_only_core(cluster, Second_num, groups, cores):
    """Append *cluster* to the 'Second_core_' bin whose range contains
    Second_num; when several bins match, the last one wins."""
    for position, (lower, upper) in enumerate(groups.values()):
        if lower <= Second_num <= upper:
            res = position
    family_group = list(groups)[res]
    cores['Second_core_' + family_group].append(cluster)
+
|
|
121
|
+
#@profile
|
|
122
|
+
def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
    """Append *cluster* to the 'only_Second_core_' bin whose range contains
    Second_num; when several bins match, the last one wins."""
    for position, (lower, upper) in enumerate(groups.values()):
        if lower <= Second_num <= upper:
            res = position
    family_group = list(groups)[res]
    cores['only_Second_core_' + family_group].append(cluster)
129
|
+
|
|
130
|
+
|
|
131
|
+
#@profile
|
|
132
|
+
def cluster(options):
    """Top-level clustering driver (Species mode).

    Parses a pre-computed sequence clustering (CD-HIT / BLAST / MMseqs), bins
    the resulting gene groups into percentage-based core categories, and
    writes summary_statistics.txt plus optional outputs (a Roary-style
    gene_presence_absence.csv, pan_genome_reference.fa, per-group FASTAs and a
    core-gene alignment) under ``options.output_dir``.

    Parameters:
        options: parsed CLI namespace; reads cluster_format, reclustered,
            verbose, output_dir, gene_presence_absence_out, run_mode, fasta,
            write_groups and align_core.

    Side effects only; returns None.
    """

    # First-round clustering: dispatch on the declared input format.
    if options.cluster_format == 'CD-HIT':
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
    elif 'BLAST' in options.cluster_format:
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_BLAST(options, '|')
    elif 'MMseqs' in options.cluster_format:
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_MMseqs(options, '|')
    # NOTE(review): an unrecognised cluster_format leaves the names above
    # unbound and raises NameError below - confirm the CLI restricts choices.

    ### Percentage bins + empty per-category result lists.
    cores, groups = get_cores(options, genome_dict)
    ###

    if options.reclustered != None: #FIX
        # A second (re-)clustering was supplied: merge it with the first round.
        if options.cluster_format == 'CD-HIT':
            combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
        elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
            #Fix
            combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')
        pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '|')
    else:
        pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)

    # Groups whose Second sequences added members but no new genomes.
    Number_Of_Second_Extending_But_Same_Genomes = 0

    # Order clusters consistently and apply the same order to every view.
    sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
    pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
    pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
    pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

    print("Calculating Groups")
    seen_groupings = []
    for cluster, numbers in pangenome_clusters_Type_sorted.items():
        ############################### Calculate First only
        cluster = str(cluster)
        for grouping in numbers[2]: #!!# Could do with a more elegant solution
            # grouping[0] is a 'cluster_id:genome_count' string.
            current_cluster = grouping[0].split(':')[0]
            if current_cluster not in seen_groupings:
                seen_groupings.append(current_cluster)
                current_cluster_size = grouping[0].split(':')[1]
                calc_First_only_core(current_cluster, current_cluster_size,groups,cores)
                ############################# Calculate First and Reclustered-Second
                # numbers layout (from *_clustering_counting) - presumably
                # [First rep count, First genome count, groupings, Second
                # genome count, same-genome Second count]; TODO confirm
                # against clusterings.py.
                if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
                    calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
                elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
                    calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
                elif numbers[4] >= 1:
                    Number_Of_Second_Extending_But_Same_Genomes += 1
            else:
                if options.verbose == True:
                    print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")

    if options.reclustered != None:
        # Split Second clusters into those attached to a First cluster and
        # those consisting of Second sequences only.
        combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
        combined_pangenome_clusters_Second_Type = defaultdict(list)
        for cluster, genomes in combined_pangenome_clusters_Second.items():
            if cluster in not_Second_only_cluster_ids:
                combined_pangenome_clusters_Second_Type[cluster] = [cluster, len(genomes)]
            else:
                combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
        for cluster, data in combined_pangenome_clusters_Second_Type.items():
            if data[1] >= 1:
                calc_Second_only_core(cluster, data[1], groups, cores)
        for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
            if data[1] >= 1:
                calc_only_Second_only_core(cluster, data[1], groups, cores)
    ###########################
    ### Output
    output_path = os.path.abspath(options.output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    stats_out = os.path.join(output_path,'summary_statistics.txt')
    # Fixed reporting order for the core categories.
    key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
    with open(stats_out, 'w') as outfile:
        print("Number of Genomes: " + str(len(genome_dict)))
        outfile.write("Number of Genomes: " + str(len(genome_dict)) + "\n")
        print("Gene Groups:")
        outfile.write("Gene Groups\n")
        for key_prefix in key_order:
            for key, value in cores.items():
                if key.startswith(key_prefix):
                    print(f"{key}: {len(value)}")
                    outfile.write(f"{key}: {len(value)}\n")
        print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
        outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
        if options.reclustered!= None:
            print("Total Number of Second Gene Groups (Including Singletons): " + str(
                len(combined_pangenome_clusters_Second_sequences)))
            print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
                Number_Of_Second_Extending_But_Same_Genomes))
            outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
                len(combined_pangenome_clusters_Second_sequences)))
            outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
                Number_Of_Second_Extending_But_Same_Genomes))
        #Report number of first and second clusters and do the same for genus
        if options.gene_presence_absence_out != False:
            gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)


    ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -write-groups not presented then it needs
    # to be done for alignment full anyway...

    genome_list = list(genome_dict.keys())
    if options.run_mode == 'Full':
        sequences = read_fasta(options.fasta)
        if options.reclustered == None:
            combined_pangenome_clusters_Second_sequences = None
        ## Output representative sequences
        representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
        with open(representatives_out, 'w') as outfile:
            for cluster, ids in pangenome_clusters_First_sequences.items():
                outfile.write('>group_'+str(cluster)+'\n')
                # Representative = first sequence listed for the cluster.
                wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                outfile.write(wrapped_aa_seq+'\n')
        if options.write_groups != None:
            print("Outputting gene group FASTA files")
            #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
            write_groups_func(options,output_dir, key_order, cores, sequences,
                              pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

        if options.align_core != None:
            # NOTE(review): output_dir is only assigned in the write_groups
            # branch above; aligning without -write-groups would hit an
            # unbound local - confirm the CLI enforces that combination.
            print("Processing gene group alignment")
            process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

    elif options.run_mode == 'Partial':
        sequences = read_fasta(options.fasta)
        if options.reclustered == None:
            combined_pangenome_clusters_Second_sequences = None
        # else: ## Output representative sequences - Under development
        #     representatives_out = os.path.join(output_path, 'pan_genome_reference_reclustered.fa')
        #     with open(representatives_out, 'w') as outfile:
        #         for cluster, ids in combined_pangenome_clusters_Second_sequences.items():
        #             outfile.write('>group_' + str(cluster) + '\n')
        #             try:
        #                 wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
        #             except:
        #                 print(2)
        #             outfile.write(wrapped_aa_seq + '\n')
        ## Output representative sequences
        representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
        with open(representatives_out, 'w') as outfile:
            for cluster, ids in pangenome_clusters_First_sequences.items():
                outfile.write('>group_'+str(cluster)+'\n')
                wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                outfile.write(wrapped_aa_seq+'\n')
        if options.write_groups != None:
            print("Outputting gene group FASTA files")
            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
            write_groups_func(options,output_dir, key_order, cores, sequences,
                              pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

        if options.align_core != None:
            print("Processing gene group alignment")
            process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')


    #
    # if options.align_core != None:
    #     #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
    #     output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
    #     if not os.path.exists(output_dir):
    #         os.makedirs(output_dir)
    #     process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')

    #
    # elif options.run_mode == 'Partial':
    #     if options.align_core != None and options.fasta != None and options.write_groups != None:
    #         process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
from .constants import *
|
|
6
|
+
from .utils import *
|
|
7
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
8
|
+
from constants import *
|
|
9
|
+
from utils import *
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main():
    """CLI entry point for Seq-Combiner: extract sequences from GFF/FASTA
    inputs in a directory and combine them into one PyamilySeq-ready FASTA.

    Exits with status 1 on invalid option combinations or when the output
    file already exists.
    """
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
    ### Required Arguments
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-input_dir', action='store', dest='input_dir',
                          help='Directory location where the files are located.',
                          required=True)
    required.add_argument('-input_type', action='store', dest='input_type', choices=['separate', 'combined', 'fasta'],
                          help='Type of input files: "separate" for separate FASTA and GFF files,'
                               ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
                               'FASTA files together.',
                          required=True)
    required.add_argument("-name_split_gff", action="store", dest="name_split_gff",
                          help="Substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff'). - Not needed with -input_type fasta",
                          required=False)
    required.add_argument("-name_split_fasta", action="store", dest="name_split_fasta",
                          help="Substring used to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta').",
                          required=False)
    required.add_argument("-output_dir", action="store", dest="output_dir",
                          help="Directory for all output files.",
                          required=True)
    required.add_argument("-output_name", action="store", dest="output_file",
                          help="Output file name.",
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
                          help='Default - "CDS": Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"'
                               ' - Not compatible with "fasta" input mode.',
                          required=False)
    optional.add_argument('-translate', action='store_true', dest='translate', default=None,
                          help='Default - False: Translate extracted sequences to their AA counterpart? - appends _aa.fasta to given output_name',
                          required=False)
    misc = parser.add_argument_group('Misc Arguments')
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Seq-Combiner version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()

    # Validate that the filename-splitting substring required by the chosen
    # input type was provided (the two GFF-based modes share one check).
    if options.input_type in ('separate', 'combined') and options.name_split_gff is None:
        print("Please provide a substring to split the filename and extract the genome name.")
        exit(1)
    if options.input_type == 'fasta' and options.name_split_fasta is None:
        print("Please provide a substring to split the filename and extract the genome name.")
        # BUG FIX: the original had a bare 'exit' (attribute access, not a
        # call), so execution fell through and later crashed on a None
        # split substring.
        exit(1)

    output_path = os.path.abspath(options.output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    output_file = options.output_file + '.fasta'
    # Refuse to clobber an existing combined FASTA.
    if os.path.exists(os.path.join(output_path, output_file)):
        print(f"Output file {output_file} already exists in the output directory. Please delete or rename the file and try again.")
        exit(1)

    combined_out_file = os.path.join(output_path, output_file)

    # Dispatch to the extraction routine matching the declared input layout.
    if options.input_type == 'separate':
        read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate)
    elif options.input_type == 'combined':
        read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate)
    elif options.input_type == 'fasta':
        read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)

if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
def find_gene_ids_in_csv(csv_file, group_name):
    """Find gene IDs associated with *group_name* in the CSV file.

    Gene IDs are taken from column 14 onward (the per-genome columns of a
    gene_presence_absence.csv); a cell may hold several space-separated IDs.

    Returns:
        A (possibly empty) list of gene-ID strings, in file order.
    """
    gene_ids = []
    with open(csv_file, 'r') as f:
        for line in f:
            cells = line.strip().split(',')
            if cells[0].replace('"', '') == group_name:
                # Collect gene IDs from column 14 onward; strip the CSV quoting.
                for cell in cells[14:]:
                    gene_ids.extend(cell.strip().replace('"', '').split())
                # FIX: stop after the matching row instead of scanning the
                # whole file (group names are unique; the original comment
                # even ended with a stray 'break').
                break
    return gene_ids
|
+
|
|
16
|
+
def extract_sequences(fasta_file, gene_ids):
    """Collect FASTA records whose gene ID (the part after '|' in the header)
    appears in *gene_ids*.

    Returns a dict mapping gene ID -> list of raw record lines (header first).
    """
    matched = {}
    collecting = False
    header_id = ""
    remaining = copy.deepcopy(gene_ids)  # IDs not yet seen (currently unreported)
    with open(fasta_file, 'r') as handle:
        for raw in handle:
            if raw.startswith('>'):
                # Header format: '>genome|gene ...' - keep only the gene part.
                header_id = raw[1:].strip().split()[0].split('|')[1]
                collecting = header_id in gene_ids
                if header_id in remaining:
                    remaining.remove(header_id)
                if collecting:
                    matched[header_id] = [raw.strip()]
            elif collecting:
                matched[header_id].append(raw.strip())
    return matched
+
|
|
36
|
+
def main():
    """CLI entry point: write the FASTA records belonging to one group
    (looked up in a gene_presence_absence-style CSV) to an output file."""
    parser = argparse.ArgumentParser(description="Extract sequences for specified group name from CSV file and corresponding FASTA file.")
    parser.add_argument("-csv", action='store', dest='csv_file',
                        help="CSV file containing group data", required=True)
    parser.add_argument("-group", action='store', dest='group_name',
                        help="Group name to search for in the CSV", required=True)
    parser.add_argument("-fasta", action='store', dest='fasta_file',
                        help="Input FASTA file containing sequences", required=True)
    parser.add_argument("-out", action='store', dest='output_file',
                        help="Output FASTA file with extracted sequences", required=True)

    options = parser.parse_args()

    # Resolve the group name to its member gene IDs.
    gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
    if not gene_ids:
        print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")
        return

    # Pull the matching records out of the FASTA file.
    sequences = extract_sequences(options.fasta_file, gene_ids)

    # Write each matched record (header + sequence lines) to the output file.
    with open(options.output_file, 'w') as output:
        for record_lines in sequences.values():
            output.write("\n".join(record_lines) + "\n")

if __name__ == "__main__":
    main()
|
PyamilySeq/Seq_Finder.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_fasta_ids(fasta_file):
    """Return the sequence IDs (first whitespace-delimited token after '>')
    from a FASTA file, in file order."""
    with open(fasta_file, 'r') as handle:
        return [line[1:].strip().split()[0] for line in handle if line.startswith('>')]
15
|
+
|
|
16
|
+
|
|
17
|
+
def find_ids_in_csv(ids, csv_file):
    """Search for each ID in the CSV file and report the first column of every
    row where it is found.

    Returns:
        defaultdict(list): row[0] -> list of IDs (in *ids* order) that appear
        as an exact cell value somewhere in that row. Rows with no matches
        get no entry.
    """
    found_records = collections.defaultdict(list)
    with open(csv_file, 'r') as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if row:  # Ensure row is not empty
                # PERF: the original tested every ID against the raw row list
                # (O(len(ids) * len(row)) per row, flagged '# slow'); a set
                # makes each membership test O(1) with identical results.
                row_cells = set(row)
                matches = [seq_id for seq_id in ids if seq_id in row_cells]
                if matches:
                    found_records[row[0]].extend(matches)
    return found_records
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main():
    """CLI entry point: list the FASTA sequence IDs found in a CSV, keyed by
    the first column of each row that contains them."""
    parser = argparse.ArgumentParser(description="Extract IDs from a FASTA file and search for them in a CSV file.")
    parser.add_argument("-in", action='store', dest='fasta_file',
                        help="Input FASTA file", required=True)
    parser.add_argument("-ids", action='store', dest='csv_file',
                        help="CSV file containing IDs to search for", required=True)
    parser.add_argument("-out", action='store', dest='output_file',
                        help="Output file to save found IDs", required=True)

    options = parser.parse_args()

    # Gather the query IDs, then locate them in the CSV.
    ids = parse_fasta_ids(options.fasta_file)
    found_records = find_ids_in_csv(ids, options.csv_file)

    # One line per first-column value, with the (list-formatted) matched IDs.
    with open(options.output_file, 'w') as output:
        output.write("ID,Found_In_First_Column\n")
        for seq_id, found_in in found_records.items():
            output.write(f"{seq_id},{found_in}\n")


if __name__ == "__main__":
    main()
|
PyamilySeq/__init__.py
ADDED
|
File without changes
|