PyamilySeq 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
1
+
2
+ try:
3
+ from .constants import *
4
+ from .clusterings import *
5
+ from .utils import *
6
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
7
+ from constants import *
8
+ from clusterings import *
9
+ from utils import *
10
+
11
+
12
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
    """Write a Roary-style gene_presence_absence.csv into options.output_dir.

    The file has 14 fixed metadata columns followed by one column per genome;
    each genome cell lists that genome's gene IDs for the cluster, joined by
    tabs.  Sequence IDs are expected to look like 'genome|gene'.

    Args:
        options: parsed CLI options; only output_dir is read here.
        genome_dict: ordered mapping of genome name -> (unused here); key
            order fixes the genome column order.
        pangenome_clusters_First_sorted: cluster id -> list of genomes in it.
        pangenome_clusters_First_sequences_sorted: cluster id -> list of
            'genome|gene' sequence IDs.

    Fixes vs previous version: the header had a stray quote between
    "Order within Fragment" and "Accessory Fragment", the "No. isolates"
    field was missing its opening quote, the data rows carried a different
    number of metadata fields than the header, and the file handle was
    never closed.
    """
    print("Outputting gene_presence_absence file")
    output_dir = os.path.abspath(options.output_dir)
    gpa_path = os.path.join(output_dir, 'gene_presence_absence.csv')
    # 14 fixed metadata columns (Roary layout) — genome columns follow.
    header_fields = ['Gene', 'Non-unique Gene name', 'Annotation', 'No. isolates',
                     'No. sequences', 'Avg sequences per isolate', 'Genome Fragment',
                     'Order within Fragment', 'Accessory Fragment',
                     'Accessory Order with Fragment', 'QC', 'Min group size nuc',
                     'Max group size nuc', 'Avg group size nuc']
    with open(gpa_path, 'w') as gpa_outfile:
        gpa_outfile.write(','.join('"' + field + '"' for field in header_fields))
        gpa_outfile.write(',' + ','.join('"' + genome + '"' for genome in genome_dict.keys()))
        gpa_outfile.write('\n')
        for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
            genomes_in_cluster = pangenome_clusters_First_sorted[cluster]
            average_sequences_per_genome = len(sequences) / len(genomes_in_cluster)
            # Six populated metadata fields + eight empty ones = 14, matching the header.
            row = ['group_' + str(cluster), '', '',
                   str(len(genomes_in_cluster)), str(len(sequences)),
                   str(average_sequences_per_genome)] + [''] * 8
            gpa_outfile.write(','.join('"' + field + '"' for field in row))
            for genome in genome_dict.keys():
                # All gene IDs this genome contributes to the cluster, tab-joined.
                genes = [value.split('|')[1] for value in sequences
                         if value.split('|')[0] == genome]
                gpa_outfile.write(',"' + '\t'.join(genes) + '"')
            gpa_outfile.write('\n')
56
def get_cores(options, genome_dict):
    """Build the empty core-category containers and the percentage-band limits.

    For each percentage in options.species_groups (e.g. "99,95"), the band is
    (floor(pct% of genome count), previous band's lower bound), both inclusive,
    so consecutive percentages partition the genome-count range from the top
    down.  A 'First_core_<pct>' list is always created; when a reclustered
    (Second) round is supplied, the extended/combined/Second/only_Second
    categories are created too.

    Args:
        options: parsed CLI options; reads species_groups and reclustered.
        genome_dict: mapping of genome name -> data; only its size is used.

    Returns:
        (cores, groups): cores maps category name -> empty list to be filled
        by the calc_* helpers; groups maps percentage string -> (low, high)
        inclusive genome-count band.

    Fix vs previous version: the first-iteration flag guarded two identical
    branches, so the dead conditional has been removed.
    """
    groups = OrderedDict()
    cores = OrderedDict()
    prev_top = len(genome_dict)
    for group in options.species_groups.split(','):
        calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
        groups[group] = (calculated_floor, prev_top)
        prev_top = calculated_floor
        cores['First_core_' + group] = []
        if options.reclustered != None:
            cores['extended_core_' + group] = []
            cores['combined_core_' + group] = []
            cores['Second_core_' + group] = []
            cores['only_Second_core_' + group] = []
    return cores, groups
82
+
83
#@profile
def calc_First_only_core(cluster, First_num, groups, cores):
    """Assign a First-only cluster to the core band containing First_num.

    The LAST band whose inclusive (low, high) range contains First_num wins,
    matching the original scan order; First_num may be a string or int.
    """
    band_limits = list(groups.values())
    for position, (lower_bound, upper_bound) in enumerate(band_limits):
        if lower_bound <= int(First_num) <= upper_bound:
            res = position
    # NOTE(review): like the original, this raises NameError if no band matches.
    family_group = list(groups)[res]
    cores['First_core_' + family_group].append(cluster)
90
+
91
#@profile
def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
    """Assign a single-First cluster extended by Second sequences to a band.

    The combined genome count First_num + Second_num is tested against each
    band; the LAST matching band wins, matching the original scan order.
    """
    combined_total = First_num + Second_num
    band_limits = list(groups.values())
    for position, (lower_bound, upper_bound) in enumerate(band_limits):
        if lower_bound <= combined_total <= upper_bound:
            res = position
    family_group = list(groups)[res]
    cores['extended_core_' + family_group].append(cluster)
99
+
100
+
101
#@profile
def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
    """Assign a cluster where Seconds merged multiple Firsts to a band.

    Unlike the last-match helpers, the FIRST band containing
    First_num + Second_num wins (the original breaks on first hit).
    """
    combined_total = First_num + Second_num
    band_limits = list(groups.values())
    position = 0
    while position < len(band_limits):
        lower_bound, upper_bound = band_limits[position]
        if lower_bound <= combined_total <= upper_bound:
            res = position
            break
        position += 1
    family_group = list(groups)[res]
    cores['combined_core_' + family_group].append(cluster)
111
+
112
+
113
#@profile
def calc_Second_only_core(cluster, Second_num, groups, cores):
    """Assign a Second-round cluster to the band containing Second_num.

    The LAST matching band wins, matching the original scan order.
    """
    band_limits = list(groups.values())
    for position, (lower_bound, upper_bound) in enumerate(band_limits):
        if lower_bound <= Second_num <= upper_bound:
            res = position
    family_group = list(groups)[res]
    cores['Second_core_' + family_group].append(cluster)
120
+
121
#@profile
def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
    """Assign a cluster containing ONLY Second-round sequences to a band.

    Same banding rule as calc_Second_only_core (last match wins) but records
    into the 'only_Second_core_' categories.
    """
    band_limits = list(groups.values())
    for position, (lower_bound, upper_bound) in enumerate(band_limits):
        if lower_bound <= Second_num <= upper_bound:
            res = position
    family_group = list(groups)[res]
    cores['only_Second_core_' + family_group].append(cluster)
128
+
129
+
130
+
131
#@profile
def cluster(options):
    """Top-level clustering driver.

    Loads First-round clusters in the requested format, optionally merges a
    Second (reclustered) round, bins gene groups into the core/accessory
    percentage bands built by get_cores(), writes summary_statistics.txt and
    optionally gene_presence_absence.csv, then (in 'Full'/'Partial' run modes)
    writes representative sequences and, on request, per-group FASTAs and a
    core gene alignment.

    NOTE(review): this body was reconstructed from a whitespace-mangled dump;
    the nesting of a few branches is inferred from context — confirm against
    the released package.
    """

    # --- Load the First-round clustering in whichever format was provided ---
    if options.cluster_format == 'CD-HIT':
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
    elif 'BLAST' in options.cluster_format:
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_BLAST(options, '|')
    elif 'MMseqs' in options.cluster_format:
        genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_MMseqs(options, '|')
    # NOTE(review): an unrecognised cluster_format leaves genome_dict et al.
    # unbound and the next call raises NameError — no explicit error path here.

    ###
    cores, groups = get_cores(options, genome_dict)
    ###

    # --- Optionally merge the Second (reclustered) round into the Firsts ---
    if options.reclustered != None: #FIX
        if options.cluster_format == 'CD-HIT':
            combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
        elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
            #Fix
            combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')
        pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '|')
    else:
        pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)

    # Firsts whose Second sequences added no genomes beyond those already seen.
    Number_Of_Second_Extending_But_Same_Genomes = 0

    # Present clusters largest-first: sort keys by sequence counts, then
    # reorder every per-cluster dict with the same key order.
    sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
    pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
    pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
    pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

    print("Calculating Groups")
    seen_groupings = []
    for cluster, numbers in pangenome_clusters_Type_sorted.items():
        ############################### Calculate First only
        cluster = str(cluster)
        # numbers[2] holds 'cluster_id:genome_count' groupings; each First
        # cluster is binned once, the first time it is encountered.
        for grouping in numbers[2]: #!!# Could do with a more elegant solution
            current_cluster = grouping[0].split(':')[0]
            if current_cluster not in seen_groupings:
                seen_groupings.append(current_cluster)
                current_cluster_size = grouping[0].split(':')[1]
                calc_First_only_core(current_cluster, current_cluster_size,groups,cores)
                ############################# Calculate First and Reclustered-Second
                # NOTE(review): indentation inferred — these per-cluster counts
                # are evaluated inside the per-grouping loop.
                if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
                    calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
                elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
                    calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
                elif numbers[4] >= 1:
                    Number_Of_Second_Extending_But_Same_Genomes += 1
            else:
                if options.verbose == True:
                    print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")

    # --- Bin Second-round clusters: those overlapping Firsts vs Second-only ---
    if options.reclustered != None:
        combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
        combined_pangenome_clusters_Second_Type = defaultdict(list)
        for cluster, genomes in combined_pangenome_clusters_Second.items():
            if cluster in not_Second_only_cluster_ids:
                combined_pangenome_clusters_Second_Type[cluster] = [cluster, len(genomes)]
            else:
                combined_pangenome_clusters_ONLY_Second_Type[cluster] = [cluster, len(genomes)]
        for cluster, data in combined_pangenome_clusters_Second_Type.items():
            if data[1] >= 1:
                calc_Second_only_core(cluster, data[1], groups, cores)
        for cluster, data in combined_pangenome_clusters_ONLY_Second_Type.items():
            if data[1] >= 1:
                calc_only_Second_only_core(cluster, data[1], groups, cores)
    ###########################
    ### Output
    output_path = os.path.abspath(options.output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    stats_out = os.path.join(output_path,'summary_statistics.txt')
    # Fixed reporting order for the core categories.
    key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
    with open(stats_out, 'w') as outfile:
        print("Number of Genomes: " + str(len(genome_dict)))
        outfile.write("Number of Genomes: " + str(len(genome_dict)) + "\n")
        print("Gene Groups:")
        outfile.write("Gene Groups\n")
        for key_prefix in key_order:
            for key, value in cores.items():
                if key.startswith(key_prefix):
                    print(f"{key}: {len(value)}")
                    outfile.write(f"{key}: {len(value)}\n")
        print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
        outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
        if options.reclustered!= None:
            print("Total Number of Second Gene Groups (Including Singletons): " + str(
                len(combined_pangenome_clusters_Second_sequences)))
            print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
                Number_Of_Second_Extending_But_Same_Genomes))
            outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
                len(combined_pangenome_clusters_Second_sequences)))
            outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
                Number_Of_Second_Extending_But_Same_Genomes))
    #Report number of first and second clusters and do the ame for genus
    if options.gene_presence_absence_out != False:
        gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)


    ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -wruite-groups not presented then it needs
    # to be done for alignment full anyway...

    genome_list = list(genome_dict.keys())
    if options.run_mode == 'Full':
        sequences = read_fasta(options.fasta)
        if options.reclustered == None:
            combined_pangenome_clusters_Second_sequences = None
        ## Output representative sequences
        representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
        with open(representatives_out, 'w') as outfile:
            for cluster, ids in pangenome_clusters_First_sequences.items():
                outfile.write('>group_'+str(cluster)+'\n')
                # Representative = first sequence ID in the cluster, wrapped at 60 chars.
                wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                outfile.write(wrapped_aa_seq+'\n')
        if options.write_groups != None:
            print("Outputting gene group FASTA files")
            #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
            write_groups_func(options,output_dir, key_order, cores, sequences,
                              pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

        # NOTE(review): if align_core is set without write_groups, output_dir
        # is unbound here and this raises NameError — confirm intended usage.
        if options.align_core != None:
            print("Processing gene group alignment")
            process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

    elif options.run_mode == 'Partial':
        sequences = read_fasta(options.fasta)
        if options.reclustered == None:
            combined_pangenome_clusters_Second_sequences = None
        # else: ## Output representative sequences - Under development
        #     representatives_out = os.path.join(output_path, 'pan_genome_reference_reclustered.fa')
        #     with open(representatives_out, 'w') as outfile:
        #         for cluster, ids in combined_pangenome_clusters_Second_sequences.items():
        #             outfile.write('>group_' + str(cluster) + '\n')
        #             try:
        #                 wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
        #             except:
        #                 print(2)
        #             outfile.write(wrapped_aa_seq + '\n')
        ## Output representative sequences
        representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
        with open(representatives_out, 'w') as outfile:
            for cluster, ids in pangenome_clusters_First_sequences.items():
                outfile.write('>group_'+str(cluster)+'\n')
                wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                outfile.write(wrapped_aa_seq+'\n')
        if options.write_groups != None:
            print("Outputting gene group FASTA files")
            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
            write_groups_func(options,output_dir, key_order, cores, sequences,
                              pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

        if options.align_core != None:
            print("Processing gene group alignment")
            process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')



    #
    # if options.align_core != None:
    #     #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
    #     output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
    #     if not os.path.exists(output_dir):
    #         os.makedirs(output_dir)
    #     process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')

    #
    # elif options.run_mode == 'Partial':
    #     if options.align_core != None and options.fasta != None and options.write_groups != None:
    #         process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
    #
    #
    #
    #
    #
309
+
@@ -0,0 +1,66 @@
1
+ import argparse
2
+
3
+
4
+ try:
5
+ from .constants import *
6
+ from .utils import *
7
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
8
+ from constants import *
9
+ from utils import *
10
+
11
+
12
+
13
def main():
    """Seq-Combiner CLI entry point.

    Parses command-line options, creates the output directory, and dispatches
    to the reader matching the chosen input type: separate FASTA+GFF files,
    GFF files with embedded FASTA, or plain FASTA files, combining everything
    into a single output FASTA (optionally translated to amino acids).
    """
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
    ### Required Arguments
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-input_dir', action='store', dest='input_dir',
                          help='Directory location where the files are located.',
                          required=True)
    required.add_argument('-input_type', action='store', dest='input_type', choices=['separate', 'combined', 'fasta'],
                          help='Type of input files: "separate" for separate FASTA and GFF files,'
                               ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
                               'FASTA files together.',
                          required=True)
    required.add_argument("-name_split", action="store", dest="name_split",
                          help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
                          required=True)
    required.add_argument("-output_dir", action="store", dest="output_dir",
                          help="Directory for all output files.",
                          required=True)
    # NOTE: flag is -output_name but it is stored as options.output_file.
    required.add_argument("-output_name", action="store", dest="output_file",
                          help="Output file name (without .fasta).",
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
                          help='Default - "CDS": Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"'
                               ' - Not compatible with "fasta" input mode.',
                          required=False)
    optional.add_argument('-translate', action='store_true', dest='translate', default=None,
                          help='Default - False: Translate extracted sequences to their AA counterpart? - appends _aa.fasta to given output_name',
                          required=False)
    misc = parser.add_argument_group('Misc Arguments')
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Seq-Combiner version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()



    # Ensure the output directory exists before any reader writes into it.
    output_path = os.path.abspath(options.output_dir)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    combined_out_file = os.path.join(output_path, options.output_file + '.fasta')

    # Dispatch on input layout; the read_* helpers come from utils.
    if options.input_type == 'separate':
        read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
    elif options.input_type == 'combined':
        read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
    elif options.input_type == 'fasta':
        read_fasta_files(options.input_dir, options.name_split, combined_out_file, options.translate)

if __name__ == "__main__":
    main()
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ import copy
3
+
4
def find_gene_ids_in_csv(csv_file, group_name):
    """Return the gene IDs listed for *group_name* in the CSV file.

    Genome columns start at index 14 (after the fixed metadata columns of a
    gene_presence_absence-style file).  Rows are split naively on commas and
    quotes are stripped; cells holding several IDs are split on whitespace.
    """
    collected = []
    with open(csv_file, 'r') as handle:
        for row_text in handle:
            columns = row_text.strip().split(',')
            if columns[0].replace('"', '') != group_name:
                continue
            for genome_cell in columns[14:]:
                cleaned = genome_cell.strip().replace('"', '')
                collected.extend(cleaned.split())
    return collected
15
+
16
def extract_sequences(fasta_file, gene_ids):
    """Collect the FASTA records whose IDs appear in *gene_ids*.

    Headers are expected to look like '>genome|gene_id ...'; the token after
    the first '|' is the record ID.  Returns {gene_id: [header, seq lines...]}
    with every line stripped of trailing whitespace.
    """
    matched = {}
    capturing = False
    record_id = ""
    # NOTE(review): 'not_found' is tracked but never returned or reported;
    # kept to mirror the original behaviour.
    not_found = copy.deepcopy(gene_ids)
    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('>'):
                # assumes every header contains a '|' separator — a plain
                # '>gene' header would raise IndexError here; TODO confirm
                record_id = raw_line[1:].strip().split()[0].split('|')[1]
                capturing = record_id in gene_ids
                if record_id in not_found:
                    not_found.remove(record_id)
                if capturing:
                    matched[record_id] = [raw_line.strip()]
            elif capturing:
                matched[record_id].append(raw_line.strip())
    return matched
35
+
36
def main():
    """CLI entry point: pull the gene IDs of one group from a
    gene_presence_absence-style CSV, then write the matching FASTA records
    to the output file."""
    parser = argparse.ArgumentParser(description="Extract sequences for specified group name from CSV file and corresponding FASTA file.")
    parser.add_argument("-csv", action='store', dest='csv_file',
                        help="CSV file containing group data", required=True)
    parser.add_argument("-group", action='store', dest='group_name',
                        help="Group name to search for in the CSV", required=True)
    parser.add_argument("-fasta", action='store', dest='fasta_file',
                        help="Input FASTA file containing sequences", required=True)
    parser.add_argument("-out", action='store', dest='output_file',
                        help="Output FASTA file with extracted sequences", required=True)

    options = parser.parse_args()

    # Find gene IDs in CSV
    gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
    if not gene_ids:
        # Nothing to extract — report and exit without creating the output file.
        print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")
        return

    # Extract sequences from the FASTA file
    sequences = extract_sequences(options.fasta_file, gene_ids)

    # Write matched sequences to the output FASTA file
    with open(options.output_file, 'w') as output:
        for gene_id, sequence_lines in sequences.items():
            output.write("\n".join(sequence_lines) + "\n")

if __name__ == "__main__":
    main()
@@ -0,0 +1,56 @@
1
+ import argparse
2
+ import collections
3
+ import csv
4
+
5
+
6
def parse_fasta_ids(fasta_file):
    """Return the sequence IDs from a FASTA file.

    The ID is the first whitespace-delimited token after '>' on each header
    line; order of appearance is preserved and duplicates are kept.
    """
    found = []
    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            if not raw_line.startswith('>'):
                continue
            found.append(raw_line[1:].strip().split()[0])
    return found
15
+
16
+
17
def find_ids_in_csv(ids, csv_file):
    """Search for each ID in the CSV file and report the rows where it appears.

    An ID "appears" in a row when it equals one of the row's cells exactly.
    Returns a defaultdict mapping each matching row's first column to the list
    of matched IDs, preserving the order of *ids*.

    Fix vs previous version: the original tested ``id in row`` against the raw
    list for every ID, an O(len(ids) * len(row)) scan per row (flagged
    '# slow'); each row is now converted to a set once so membership is O(1).
    """
    found_records = collections.defaultdict(list)
    with open(csv_file, 'r') as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            if row:  # Ensure row is not empty
                row_cells = set(row)  # one pass per row instead of one per ID
                for seq_id in ids:
                    if seq_id in row_cells:
                        found_records[row[0]].append(seq_id)
    return found_records
29
+
30
+
31
def main():
    """CLI entry point: read sequence IDs from a FASTA file, search for them
    in a CSV file, and write which first-column value each ID was found
    under to the output file (CSV with a header row)."""
    parser = argparse.ArgumentParser(description="Extract IDs from a FASTA file and search for them in a CSV file.")
    parser.add_argument("-in", action='store', dest='fasta_file',
                        help="Input FASTA file", required=True)
    parser.add_argument("-ids", action='store', dest='csv_file',
                        help="CSV file containing IDs to search for", required=True)
    parser.add_argument("-out", action='store', dest='output_file',
                        help="Output file to save found IDs", required=True)

    options = parser.parse_args()

    # Parse IDs from the FASTA file
    ids = parse_fasta_ids(options.fasta_file)

    # Find IDs in the CSV file
    found_records = find_ids_in_csv(ids, options.csv_file)

    # Write output
    # NOTE(review): found_in is a Python list, so it is rendered with
    # brackets/quotes inside the CSV cell — confirm that is intended.
    with open(options.output_file, 'w') as output:
        output.write("ID,Found_In_First_Column\n")
        for seq_id, found_in in found_records.items():
            output.write(f"{seq_id},{found_in}\n")


if __name__ == "__main__":
    main()
PyamilySeq/__init__.py ADDED
File without changes