PyamilySeq 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,287 +0,0 @@
1
- #from line_profiler_pycharm import profile
2
-
3
- import math
4
-
5
- try:
6
- from .Constants import *
7
- from .clusterings import *
8
- from .utils import *
9
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
10
- from Constants import *
11
- from clusterings import *
12
- from utils import *
13
-
14
-
15
- #def output_fasta(options, gene_families):
16
-
17
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
    """Write a ``<name>_gene_presence_absence.csv`` table for the First clusters.

    One header row is written, followed by one row per cluster. Each row holds
    the group name, the isolate/sequence counts and, for every genome in
    ``genome_dict``, the sequence ids of that cluster belonging to the genome.

    Args:
        options: Object providing ``output_dir`` (destination directory) and
            ``clusters`` (path of the clustering input; its base name up to
            the first ``.`` prefixes the output file name).
        genome_dict: Mapping whose keys are genome names; key order defines
            the order of the per-genome columns.
        pangenome_clusters_First_sorted: Mapping of cluster id to a sized
            collection; its length is reported as "No. isolates".
        pangenome_clusters_First_sequences_sorted: Mapping of cluster id to an
            iterable of sequence ids of the form ``genome|...``.
    """
    import csv  # local import: this block cannot rely on the module-level imports providing it

    print("Outputting gene_presence_absence file")
    output_dir = os.path.abspath(options.output_dir)
    # Take the base name first so that dots in directory names (e.g. "./run.1/x.clstr")
    # cannot truncate the output prefix.
    in_name = os.path.basename(options.clusters).split('.')[0]
    gpa_path = os.path.join(output_dir, in_name + '_gene_presence_absence.csv')

    genomes = list(genome_dict)
    header = [
        "Gene", "Non-unique Gene name", "Annotation", "No. isolates", "No. sequences",
        "Avg sequences per isolate", "Genome Fragment", "Order within Fragment",
        "Accessory Fragment", "Accessory Order with Fragment", "QC",
        "Min group size nuc", "Max group size nuc", "Avg group size nuc",
    ] + genomes
    # Columns between "Avg sequences per isolate" and the first genome are not populated.
    unused_trailing_columns = [''] * 8

    # The context manager guarantees the file is flushed and closed, even on error.
    with open(gpa_path, 'w', newline='') as gpa_outfile:
        writer = csv.writer(gpa_outfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
        writer.writerow(header)
        for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
            num_isolates = len(pangenome_clusters_First_sorted[cluster])
            average_sequences_per_genome = len(sequences) / num_isolates

            # Group the sequence ids by genome once instead of rescanning per genome.
            ids_by_genome = {}
            for value in sequences:
                ids_by_genome.setdefault(value.split('|')[0], []).append(value)

            row = [
                'group_' + str(cluster),
                '',  # Non-unique Gene name
                '',  # Annotation
                str(num_isolates),
                str(len(sequences)),
                str(average_sequences_per_genome),
            ] + unused_trailing_columns
            # NOTE(review): several ids from the same genome are concatenated without a
            # separator, exactly as before - confirm whether a delimiter is wanted here.
            row.extend(''.join(ids_by_genome.get(genome, [])) for genome in genomes)
            writer.writerow(row)
45
-
46
- ### Below is some unfinished code
47
- # edge_list_outfile = open(in_name+'_edge_list.csv','w')
48
- # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
49
- # output = []
50
- # for entry in sequences:
51
- # # Split each entry at '|'
52
- # genome, gene = entry.split('|')
53
- # # Format the result as "gene genome"
54
- # output.append(f"{gene}\t{genome}")
55
- # for line in output:
56
- # edge_list_outfile.write(line + '\n')
57
-
58
-
59
-
60
-
61
def get_cores(options, genome_dict):
    """Build the core-group ranges and the empty per-group cluster lists.

    ``options.core_groups`` is a comma-separated list of integer percentages
    (for example ``"99,95,15,0"``). Each percentage becomes a group whose
    range is ``(floor, top)``: ``floor`` is that percentage of the number of
    genomes rounded down, and ``top`` is the previous group's floor (the
    number of genomes for the first group).

    Args:
        options: Object providing ``core_groups`` and ``reclustered``.
        genome_dict: Sized collection of genomes; only its length is used.

    Returns:
        A ``(cores, groups)`` tuple of ordered dicts. ``groups`` maps each
        percentage label to its ``(floor, top)`` tuple. ``cores`` maps
        ``'First_core_<label>'`` to an empty list and, when
        ``options.reclustered`` is not ``None``, also the ``extended_core_``,
        ``combined_core_``, ``Second_core_`` and ``only_Second_core_`` keys.
    """
    groups = OrderedDict()
    cores = OrderedDict()
    num_genomes = len(genome_dict)

    prefixes = ['First_core_']
    if options.reclustered is not None:
        prefixes += ['extended_core_', 'combined_core_', 'Second_core_', 'only_Second_core_']

    prev_top = num_genomes
    for group in options.core_groups.split(','):
        group = group.strip()  # tolerate "99, 95" style input without polluting the labels
        # Integer arithmetic gives the exact floor; the float form
        # (29 / 100 * 100 == 28.999999999999996) can land one genome too low.
        calculated_floor = int(group) * num_genomes // 100
        groups[group] = (calculated_floor, prev_top)
        prev_top = calculated_floor
        for prefix in prefixes:
            cores[prefix + group] = []
    return cores, groups
87
-
88
- #@profile
89
def _select_core_group(groups, count, first_match=False):
    """Return the label of the group whose ``(floor, top)`` range contains ``count``.

    Ranges are inclusive at both ends, so a count sitting on a shared boundary
    matches two adjacent groups. By default the last matching group in
    iteration order wins; with ``first_match=True`` the first one wins.
    Returns ``None`` when no range contains ``count``.
    """
    selected = None
    for label, (floor, top) in groups.items():
        if floor <= count <= top:
            selected = label
            if first_match:
                break
    return selected


def calc_First_only_core(cluster, First_num, groups, cores):
    """Record ``cluster`` under ``First_core_<group>`` for the group matching ``First_num``.

    ``First_num`` may be a numeric string; it is converted with ``int``.
    Nothing is recorded when no group range contains the count.
    """
    family_group = _select_core_group(groups, int(First_num))
    if family_group is not None:
        cores['First_core_' + family_group].append(cluster)


def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num):
    """Record ``cluster`` under ``extended_core_<group>`` using ``First_num + Second_num``.

    Counts gene families extended with Second sequences. Nothing is recorded
    when no group range contains the combined count.
    """
    family_group = _select_core_group(groups, First_num + Second_num)
    if family_group is not None:
        cores['extended_core_' + family_group].append(cluster)


def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num):
    """Record ``cluster`` under ``combined_core_<group>`` using ``First_num + Second_num``.

    Counts separately the gene families where Second sequences combined more
    than one First representative. Nothing is recorded when no group range
    contains the combined count.
    """
    # NOTE(review): this helper has always stopped at the FIRST matching group while its
    # siblings keep the LAST one, so boundary counts resolve differently - confirm intent.
    family_group = _select_core_group(groups, First_num + Second_num, first_match=True)
    if family_group is not None:
        cores['combined_core_' + family_group].append(cluster)


def calc_Second_only_core(cluster, Second_num, groups, cores):
    """Record ``cluster`` under ``Second_core_<group>`` for the group matching ``Second_num``.

    Nothing is recorded when no group range contains the count.
    """
    family_group = _select_core_group(groups, Second_num)
    if family_group is not None:
        cores['Second_core_' + family_group].append(cluster)


def calc_only_Second_only_core(cluster, Second_num, groups, cores):
    """Record ``cluster`` under ``only_Second_core_<group>`` for the group matching ``Second_num``.

    Used for clusters made up of Second sequences only. Nothing is recorded
    when no group range contains the count.
    """
    family_group = _select_core_group(groups, Second_num)
    if family_group is not None:
        cores['only_Second_core_' + family_group].append(cluster)
133
-
134
-
135
-
136
- #@profile
137
def cluster(options):
    """Assign clustered gene groups to core groups and write the requested outputs.

    Steps performed:
      1. Load the First clustering (CD-HIT or TSV/CSV edge list).
      2. Build the core-group ranges with ``get_cores``.
      3. When ``options.reclustered`` is set, load the combined clustering and
         count it; otherwise count the First clustering alone.
      4. Place every cluster into its core group.
      5. Print the summary and write it to ``summary_statistics.txt`` inside
         ``options.output_dir``.
      6. Optionally write the gene presence/absence table, the gene group
         FASTA files and the gene group alignment.

    Args:
        options: Parsed command-line options. Attributes read here:
            ``cluster_format``, ``reclustered``, ``verbose``, ``output_dir``,
            ``gene_presence_absence_out``, ``run_mode``, ``write_groups``,
            ``fasta`` and ``align_core``.

    Raises:
        ValueError: If ``options.cluster_format`` is neither ``'CD-HIT'`` nor
            contains ``'TSV'``/``'CSV'``.
    """
    # NOTE(review): the nesting of this function was reconstructed from a flattened
    # listing - confirm the loop/branch structure against the packaged module.
    cluster_format = options.cluster_format
    is_cdhit = cluster_format == 'CD-HIT'
    if not is_cdhit and not ('TSV' in cluster_format or 'CSV' in cluster_format):
        # Fail with a clear message rather than an unbound-variable error further down.
        raise ValueError("Unsupported cluster format: " + str(cluster_format))
    reclustered = options.reclustered is not None

    if is_cdhit:
        (genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes,
         pangenome_clusters_First_sequences, reps) = cluster_CDHIT(options, '|')
    else:
        (genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes,
         pangenome_clusters_First_sequences, reps) = cluster_EdgeList(options, '|')

    cores, groups = get_cores(options, genome_dict)

    if reclustered:
        if is_cdhit:
            (combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids,
             combined_pangenome_clusters_Second,
             combined_pangenome_clusters_Second_sequences) = combined_clustering_CDHIT(options, genome_dict, '|')
        else:
            (combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids,
             combined_pangenome_clusters_Second,
             combined_pangenome_clusters_Second_sequences) = combined_clustering_Edge_List(options, '|')
        pangenome_clusters_Type = combined_clustering_counting(
            options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered,
            pangenome_clusters_First_genomes, '|')
    else:
        combined_pangenome_clusters_Second_sequences = None
        pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)

    Number_Of_Second_Extending_But_Same_Genomes = 0

    sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
    pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
    pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
    pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

    print("Calculating Groups")
    seen_groupings = set()  # First clusters already counted; a set keeps the lookup cheap
    for cluster_id, numbers in pangenome_clusters_Type_sorted.items():
        cluster_id = str(cluster_id)
        for grouping in numbers[2]:
            # Each grouping's first element is a "<cluster>:<size>" string.
            grouping_fields = grouping[0].split(':')
            current_cluster = grouping_fields[0]
            if current_cluster in seen_groupings:
                if options.verbose == True:
                    print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")
                continue
            seen_groupings.add(current_cluster)
            ############################### Calculate First only
            calc_First_only_core(current_cluster, grouping_fields[1], groups, cores)
            ############################# Calculate First and Reclustered-Second
            if numbers[0] == 1 and numbers[3] >= 1:  # If Seconds did not combine First reps
                calc_single_First_extended_Second_only_core(cluster_id, numbers[1], groups, cores, numbers[3])
            elif numbers[0] > 1 and numbers[3] >= 1:  # If unique Seconds combined multiple Firsts
                calc_multi_First_extended_Second_only_core(cluster_id, numbers[1], groups, cores, numbers[3])
            elif numbers[4] >= 1:
                Number_Of_Second_Extending_But_Same_Genomes += 1

    if reclustered:
        for second_cluster, genomes in combined_pangenome_clusters_Second.items():
            num_second_genomes = len(genomes)
            if num_second_genomes < 1:
                continue
            if second_cluster in not_Second_only_cluster_ids:
                calc_Second_only_core(second_cluster, num_second_genomes, groups, cores)
            else:
                calc_only_Second_only_core(second_cluster, num_second_genomes, groups, cores)

    ###########################
    ### Output
    output_path = os.path.abspath(options.output_dir)
    os.makedirs(output_path, exist_ok=True)
    stats_out = os.path.join(output_path, 'summary_statistics.txt')
    key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_', 'only_Second_core_']

    # Build the report once so the console and the file always show the same lines.
    summary_lines = ["Gene Groups:"]
    for key_prefix in key_order:
        for key, value in cores.items():
            if key.startswith(key_prefix):
                summary_lines.append(f"{key}: {len(value)}")
    summary_lines.append("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
    if reclustered:
        summary_lines.append("Total Number of Second Gene Groups (Including Singletons): " + str(
            len(combined_pangenome_clusters_Second_sequences)))
        summary_lines.append("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
            Number_Of_Second_Extending_But_Same_Genomes))
    for line in summary_lines:
        print(line)
    with open(stats_out, 'w') as outfile:
        # No trailing newline, matching the file layout produced previously.
        outfile.write('\n'.join(summary_lines))

    if options.gene_presence_absence_out != False:
        gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)

    # NOTE: carried over from the original author - in 'Full' mode the gene families
    # should be written before aligning even when write_groups was not requested.
    if options.run_mode in ('Full', 'Partial'):
        gene_families_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
        # 'Partial' runs may come without a FASTA file, in which case nothing can be written.
        groups_written = options.write_groups is not None and (
            options.run_mode == 'Full' or options.fasta is not None)
        if groups_written:
            print("Outputting gene group FASTA files")
            sequences = read_fasta(options.fasta)
            write_groups(options, gene_families_dir, key_order, cores, sequences,
                         pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

        if options.align_core is not None:
            if groups_written:
                print("Processing gene group alignment")
                process_gene_families(options, gene_families_dir, 'concatenated_genes_aligned.fasta')
            else:
                # Previously this path failed on an undefined output directory.
                print("Skipping gene group alignment - gene group FASTA files were not written.")
267
-
268
-
269
-
270
- #
271
- # if options.align_core != None:
272
- # #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
273
- # output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
274
- # if not os.path.exists(output_dir):
275
- # os.makedirs(output_dir)
276
- # process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
277
-
278
- #
279
- # elif options.run_mode == 'Partial':
280
- # if options.align_core != None and options.fasta != None and options.write_groups != None:
281
- # process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatenated_genes_aligned.fasta')
282
- #
283
- #
284
- #
285
- #
286
- #
287
-
@@ -1,67 +0,0 @@
1
- import argparse
2
-
3
-
4
- try:
5
- from .Constants import *
6
- from .utils import *
7
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
8
- from Constants import *
9
- from utils import *
10
-
11
-
12
-
13
def main():
    """Command-line entry point for Seq-Combiner.

    Parses the arguments, creates the output directory if needed and hands
    over to the reader matching ``-input_type`` (``separate``, ``combined``
    or ``fasta``), which writes the combined output file.
    """
    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
    ### Required Arguments
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-input_dir', action='store', dest='input_dir',
                          help='Directory location where the files are located.',
                          required=True)
    required.add_argument('-input_type', action='store', dest='input_type', choices=['separate', 'combined', 'fasta'],
                          help='Type of input files: "separate" for separate FASTA and GFF files,'
                               ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
                               'FASTA files together.',
                          required=True)
    required.add_argument("-name_split", action="store", dest="name_split",
                          help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
                          required=True)
    required.add_argument("-output_dir", action="store", dest="output_dir",
                          help="Directory for all output files.",
                          required=True)
    required.add_argument("-output_name", action="store", dest="output_file",
                          help="Output file name.",
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
                          help='Default - "CDS": Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"'
                               ' - Not compatible with "fasta" input mode.',
                          required=False)
    optional.add_argument('-translate', action='store_true', dest='translate', default=None,
                          help='Default - False: Translate extracted sequences to their AA counterpart?',
                          required=False)
    misc = parser.add_argument_group('Misc Arguments')
    # argparse's "version" action prints the string and exits during parsing, and it
    # never stores a ``version`` attribute on the namespace - so no check is needed
    # (or possible) after parse_args().
    misc.add_argument("-v", "--version", action="version",
                      version=f"PyamilySeq: Seq-Combiner version {PyamilySeq_Version} - Exiting",
                      help="Print out version number and exit")

    options = parser.parse_args()

    output_path = os.path.abspath(options.output_dir)
    os.makedirs(output_path, exist_ok=True)

    combined_out_file = os.path.join(output_path, options.output_file)

    if options.input_type == 'separate':
        read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
    elif options.input_type == 'combined':
        read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
    elif options.input_type == 'fasta':
        read_fasta_files(options.input_dir, options.name_split, combined_out_file, options.translate)


if __name__ == "__main__":
    main()
PyamilySeq/__init__.py DELETED
File without changes