PyamilySeq-0.6.0-py3-none-any.whl → PyamilySeq-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,6 @@
  #from line_profiler_pycharm import profile

- import copy
  import math
- import sys
-
-

  try:
      from .Constants import *
@@ -16,44 +12,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
  from utils import *


- def process_gene_families(options, directory, output_file):
-     """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
-     concatenated_sequences = {}
-     output_file = directory.replace('Gene_Families_Output',output_file)
-
-     # Iterate over each gene family file
-     for gene_file in os.listdir(directory):
-         if gene_file.endswith('.fasta'):
-             gene_path = os.path.join(directory, gene_file)
-
-             # Read sequences from the gene family file
-             sequences = read_fasta(gene_path)
-
-             # Select the longest sequence for each genome
-             longest_sequences = select_longest_gene(sequences)
-
-             # Run mafft on the longest sequences
-             aligned_file = f"{gene_file}_aligned.fasta"
-             run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
-
-             # Read aligned sequences and concatenate them
-             aligned_sequences = read_fasta(aligned_file)
-             for genome, aligned_seq in aligned_sequences.items():
-                 genome_name = genome.split('|')[0]
-                 if genome_name not in concatenated_sequences:
-                     concatenated_sequences[genome_name] = ""
-                 concatenated_sequences[genome_name] += aligned_seq
-
-             # Clean up aligned file
-             os.remove(aligned_file)
-
-     # Write the concatenated sequences to the output file
-     with open(output_file, 'w') as out:
-         for genome, sequence in concatenated_sequences.items():
-             out.write(f">{genome}\n")
-             wrapped_sequence = wrap_sequence(sequence, 60)
-             out.write(f"{wrapped_sequence}\n")
-
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
      print("Outputting gene_presence_absence file")
      output_dir = os.path.abspath(options.output_dir)
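The process_gene_families helper removed above emits the concatenated alignment as wrapped FASTA via a wrap_sequence utility. A minimal, hypothetical stand-in for that wrapping step (illustrative only, not the package's implementation):

    def wrap_sequence(sequence, width=60):
        # Break a sequence string into fixed-width lines for FASTA output.
        return "\n".join(sequence[i:i + width] for i in range(0, len(sequence), width))

    print(wrap_sequence("ACGT" * 40))   # 160 bases -> two 60-character lines plus one 40-character line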
@@ -128,7 +86,7 @@ def get_cores(options,genome_dict):
  #@profile
  def calc_First_only_core(cluster, First_num, groups, cores):
      groups_as_list = list(groups.values())
-     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num <= fir):
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
          res = idx
      family_group = list(groups)[res]
      cores['First_core_'+family_group].append(cluster)
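The added int() cast matters because, later in this diff, the value passed in as First_num is parsed from a "cluster:size" label with str.split, so it arrives as a string; in Python 3 an ordering comparison between str and int raises TypeError. A small illustration with made-up group boundaries:

    groups_as_list = [(1, 5), (6, 10)]   # hypothetical (low, high) genome-count ranges
    First_num = "7"                      # as produced by grouping[0].split(':')[1]
    # sec <= First_num <= fir would raise TypeError ('<=' not supported between int and str)
    matches = [idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir]
    print(matches)                       # [1]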
@@ -138,6 +96,7 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, core
      groups_as_list = list(groups.values())
      for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
          res = idx
+
      family_group = list(groups)[res]
      cores['extended_core_' + family_group].append(cluster)

@@ -145,8 +104,11 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, core
  #@profile
  def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
      groups_as_list = list(groups.values())
-     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
-         res = idx
+     # Looping through the list to find the matching condition
+     for idx, (sec, fir) in enumerate(groups_as_list):
+         if sec <= First_num + Second_num <= fir:
+             res = idx
+             break
      family_group = list(groups)[res]
      cores['combined_core_' + family_group].append(cluster)

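Both the old generator form and the new explicit loop leave res unassigned when no (sec, fir) range contains the total, which would surface as an UnboundLocalError at list(groups)[res]. A self-contained sketch (hypothetical names and values, not the package's code) of an equivalent lookup that fails with a clear message instead:

    def find_group_index(ranges, total):
        # Return the index of the first (low, high) range containing total.
        for idx, (low, high) in enumerate(ranges):
            if low <= total <= high:
                return idx
        raise ValueError(f"no group range contains {total}")

    print(find_group_index([(1, 5), (6, 10)], 7))   # -> 1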
@@ -173,9 +135,9 @@ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count
  def cluster(options):

      if options.cluster_format == 'CD-HIT':
-         genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
-     elif options.cluster_format in ['TSV','CSV']:
-         genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
+         genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
+     elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
+         genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')

      ###
      cores, groups = get_cores(options, genome_dict)
@@ -183,11 +145,11 @@ def cluster(options):

      if options.reclustered != None:
          if options.cluster_format == 'CD-HIT':
-             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genome_dict, '|')
-         if options.cluster_format == ['TSV','CSV']:
-             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '|')
-
-         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '|')
+             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
+         elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
+             #Fix
+             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')
+         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '|')
      else:
          pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)

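The 0.6.0 line removed above compared a string against a list with ==, a branch that can never be taken (assuming cluster_format is a plain string from argparse); the 0.7.0 replacement switches to substring checks. A quick illustration of the three forms:

    cluster_format = 'TSV'
    print(cluster_format == ['TSV', 'CSV'])   # False - a str never equals a list
    print(cluster_format in ['TSV', 'CSV'])   # True  - membership test, as in 0.6.0's first branch
    print('TSV' in cluster_format)            # True  - substring test, the 0.7.0 form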
@@ -201,19 +163,30 @@ def cluster(options):
      pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

      print("Calculating Groups")
+     seen_groupings = []
      for cluster, numbers in pangenome_clusters_Type_sorted.items():
          ############################### Calculate First only
-         calc_First_only_core(cluster, numbers[1],groups,cores)
+         cluster = str(cluster)
+         if '78' in cluster:
+             pass
+         for grouping in numbers[2]: #!!# Could do with a more elegant solution
+             current_cluster = grouping[0].split(':')[0]
+             if current_cluster not in seen_groupings:
+                 seen_groupings.append(current_cluster)
+                 current_cluster_size = grouping[0].split(':')[1]
+                 calc_First_only_core(current_cluster, current_cluster_size,groups,cores)
+                 ############################# Calculate First and Reclustered-Second
+                 if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
+                     calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
+                 elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
+                     calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
+                 elif numbers[4] >= 1:
+                     Number_Of_Second_Extending_But_Same_Genomes += 1
+             else:
+                 if options.verbose == True:
+                     print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")

          if options.reclustered != None:
-             ############################# Calculate First and Reclustered-Second
-             if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
-                 calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
-             elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
-                 calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
-             elif numbers[4] >= 1:
-                 Number_Of_Second_Extending_But_Same_Genomes += 1
-
              combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
              combined_pangenome_clusters_Second_Type = defaultdict(list)
              for cluster, genomes in combined_pangenome_clusters_Second.items():
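For readers following the new grouping pass: each entry of numbers[2] appears to carry a "cluster:size" label whose two halves are recovered with split(':'), and seen_groupings prevents the same First cluster from being counted twice. A self-contained sketch with made-up labels (not the package's data):

    groupings = [("12:5", "geneA"), ("12:5", "geneB"), ("78:2", "geneC")]   # hypothetical numbers[2]
    seen_groupings = []
    for grouping in groupings:
        current_cluster, current_cluster_size = grouping[0].split(':')
        if current_cluster not in seen_groupings:
            seen_groupings.append(current_cluster)
            print(current_cluster, int(current_cluster_size))   # prints "12 5" then "78 2"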
@@ -230,6 +203,8 @@ def cluster(options):
      ###########################
      ### Output
      output_path = os.path.abspath(options.output_dir)
+     if not os.path.exists(output_path):
+         os.makedirs(output_path)
      stats_out = os.path.join(output_path,'summary_statistics.txt')
      key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
      with open(stats_out, 'w') as outfile:
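The added directory guard works as written; for reference, os.makedirs also accepts exist_ok, which collapses the check-then-create into one call and avoids a race if the directory appears in between. A sketch with a hypothetical path:

    import os

    output_path = os.path.abspath("PyamilySeq_Output")   # hypothetical path
    os.makedirs(output_path, exist_ok=True)              # no prior os.path.exists() check needed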
@@ -240,40 +215,72 @@ def cluster(options):
                  if key.startswith(key_prefix):
                      print(f"{key}: {len(value)}")
                      outfile.write(f"{key}: {len(value)}\n")
-         print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
-         outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
-
+         print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+         outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+         if options.reclustered!= None:
+             print("Total Number of Second Gene Groups (Including Singletons): " + str(
+                 len(combined_pangenome_clusters_Second_sequences)))
+             print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
+                 Number_Of_Second_Extending_But_Same_Genomes))
+             outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
+                 len(combined_pangenome_clusters_Second_sequences)))
+             outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
+                 Number_Of_Second_Extending_But_Same_Genomes))
+     #Report number of first and second clusters and do the ame for genus
      if options.gene_presence_absence_out != None:
          gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)

-     if options.write_families != None and options.fasta != None:
-         sequences = read_fasta(options.fasta)
-         output_dir = os.path.dirname(os.path.abspath(options.output_dir))
-         output_dir = os.path.join(output_dir, 'Gene_Families_Output')
-
-         # Create output directory if it doesn't exist
-         if not os.path.exists(output_dir):
-             os.makedirs(output_dir)
-         for key_prefix in key_order:
-             for key, values in cores.items():
-                 if any(part in options.write_families.split(',') for part in key.split('_')):
-                     if key.startswith(key_prefix):
-                         for value in values:
-                             output_filename = f"{key}_{value}.fasta"
-                             sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
-                             # Write sequences to output file that are in the sequences dictionary
-                             with open(os.path.join(output_dir, output_filename), 'w') as outfile:
-                                 for header in sequences_to_write:
-                                     if header in sequences:
-                                         outfile.write(f">{header}\n")
-                                         wrapped_sequence = wrap_sequence(sequences[header])
-                                         outfile.write(f"{wrapped_sequence}\n")
-
-     if options.con_core != None and options.fasta != None and options.write_families != None:
-         process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
-
-
-
-

+     ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -wruite-groups not presented then it needs
+     # to be done for alignment full anyway...
+
+     if options.run_mode == 'Full':
+         if options.reclustered == None:
+             combined_pangenome_clusters_Second_sequences = None
+         if options.write_groups != None:
+             print("Outputting gene group FASTA files")
+             sequences = read_fasta(options.fasta)
+             #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+             output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+             write_groups(options,output_dir, key_order, cores, sequences,
+                          pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+             if options.align_core != None:
+                 print("Processing gene group alignment")
+                 process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
+
+     elif options.run_mode == 'Partial':
+         if options.reclustered == None:
+             combined_pangenome_clusters_Second_sequences = None
+         if options.write_groups != None and options.fasta != None:
+             print("Outputting gene group FASTA files")
+             sequences = read_fasta(options.fasta)
+             #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+             output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+             write_groups(options,output_dir, key_order, cores, sequences,
+                          pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+             if options.align_core != None:
+                 print("Processing gene group alignment")
+                 process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
+
+
+
+     #
+     # if options.align_core != None:
+     #     #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+     #     output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+     #     if not os.path.exists(output_dir):
+     #         os.makedirs(output_dir)
+     #     process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
+
+     #
+     # elif options.run_mode == 'Partial':
+     #     if options.align_core != None and options.fasta != None and options.write_groups != None:
+     #         process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
+     #
+     #
+     #
+     #
+     #

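The 'Full' and 'Partial' branches added above run an identical write-then-align tail, which the in-code note flags as needing a rethink. One way the shared tail could be factored out, assuming the module's read_fasta, write_groups and process_gene_families helpers shown elsewhere in this diff (illustrative sketch, not the package's code):

    def write_and_align(options, key_order, cores, first_sequences_sorted, second_sequences):
        # Shared tail of the 'Full'/'Partial' run modes: optionally write per-group
        # FASTA files, then optionally build the concatenated core alignment.
        output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
        if options.write_groups is not None and options.fasta is not None:
            sequences = read_fasta(options.fasta)
            write_groups(options, output_dir, key_order, cores, sequences,
                         first_sequences_sorted, second_sequences)
        if options.align_core is not None:
            process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')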
@@ -11,15 +11,16 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:


  def main():
-     parser = argparse.ArgumentParser(description='Seq-Combiner ' + PyamilySeq_Version + ': Seq-Combiner Run Parameters.')
+     parser = argparse.ArgumentParser(description='Seq-Combiner ' + PyamilySeq_Version + ': A tool to extract sequences from GFF/FASTA files.')
      ### Required Arguments
      required = parser.add_argument_group('Required Arguments')
      required.add_argument('-input_dir', action='store', dest='input_dir',
                            help='Directory location where the files are located.',
                            required=True)
-     required.add_argument("-input_type", action="store", dest="input_type", choices=['separate', 'combined'],
-                           help="Type of input files: 'separate' for separate FASTA and GFF files,"
-                                " 'combined' for GFF files with embedded FASTA sequences.",
+     required.add_argument('-input_type', action='store', dest='input_type', choices=['separate', 'combined', 'fasta'],
+                           help='Type of input files: "separate" for separate FASTA and GFF files,'
+                                ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
+                                'FASTA files together.',
                            required=True)
      required.add_argument("-name_split", action="store", dest="name_split",
                            help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
@@ -30,15 +31,33 @@ def main():
      required.add_argument("-output_name", action="store", dest="output_file",
                            help="Output file name.",
                            required=True)
+     optional = parser.add_argument_group('Optional Arguments')
+     optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
+                           help='Default - "CDS": Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"'
+                                ' - Not compatible with "fasta" input mode.',
+                           required=False)
+     optional.add_argument('-translate', action='store_true', dest='translate', default=None,
+                           help='Default - False: Translate extracted sequences to their AA counterpart?',
+                           required=False)
+     misc = parser.add_argument_group('Misc Arguments')
+     misc.add_argument('-v', action='store_true', dest='version',
+                       help='Print out version number and exit',
+                       required=False)
+
      options = parser.parse_args()

+     if options.version:
+         sys.exit(PyamilySeq_Version)
+
      output_path = os.path.abspath(options.output_dir)
      combined_out_file = os.path.join(output_path, options.output_file)

      if options.input_type == 'separate':
-         read_separate_files(options.input_dir, options.name_split, )
-     else:
-         read_combined_files(options.input_dir, options.name_split, combined_out_file)
+         read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
+     elif options.input_type == 'combined':
+         read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
+     elif options.input_type == 'fasta':
+         read_fasta_files(options.input_dir, options.name_split, combined_out_file, options.translate)

  if __name__ == "__main__":
      main()
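One behavioural detail of the new -v flag worth noting: because it is a plain store_true argument checked only after parse_args(), argparse will still demand -input_dir and the other required arguments before the version check can run. A minimal, self-contained sketch (illustrative names, not the package's parser) of argparse's built-in 'version' action, which exits before required-argument validation:

    import argparse

    parser = argparse.ArgumentParser(description='Seq-Combiner (illustrative)')
    parser.add_argument('-v', action='version', version='PyamilySeq 0.7.0')
    parser.add_argument('-input_dir', required=True)
    parser.parse_args(['-v'])   # prints the version and exits, even though -input_dir is missing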