PyamilySeq 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1,2 +1,2 @@
1
- PyamilySeq_Version = 'v0.7.0'
1
+ PyamilySeq_Version = 'v0.8.0'
2
2
 
@@ -0,0 +1,335 @@
1
+ import subprocess
2
+ import os
3
+ import argparse
4
+ from collections import defaultdict, OrderedDict
5
+ from line_profiler_pycharm import profile
6
+
7
+ try:
8
+ from .Constants import *
9
+ from .utils import *
10
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
11
+ from Constants import *
12
+ from utils import *
13
+
14
+ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
15
+ cdhit_command = [
16
+ clustering_mode,
17
+ '-i', input_file,
18
+ '-o', clustering_output,
19
+ '-c', str(options.pident),
20
+ '-s', str(options.len_diff),
21
+ '-T', str(options.clustering_threads),
22
+ '-M', str(options.clustering_memory),
23
+ '-d', "0",
24
+ '-sc', "1",
25
+ '-sf', "1"
26
+ ]
27
+ if options.verbose:
28
+ subprocess.run(cdhit_command)
29
+ else:
30
+ subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
31
+
32
+
33
+ def calculate_new_rep_seq(cluster_data):
34
+ total_length = sum(entry['length'] for entry in cluster_data)
35
+ avg_length = total_length / len(cluster_data)
36
+
37
+ total_identity = sum(entry['percent_identity'] for entry in cluster_data)
38
+ avg_identity = total_identity / len(cluster_data)
39
+
40
+ # Calculate a score based on both length difference and percent identity
41
+ def score(entry):
42
+ length_diff = abs(entry['length'] - avg_length)
43
+ identity_diff = abs(entry['percent_identity'] - avg_identity)
44
+ return length_diff + (100 - identity_diff) # You can weight these differently
45
+
46
+ rep_entry = min(cluster_data, key=score)
47
+ return rep_entry
48
+
49
+
50
+ def length_within_threshold(rep_length, length, len_diff):
51
+ return abs(rep_length - length) / rep_length <= len_diff
52
+
53
+
54
+ def check_if_all_identical(clustered_sequences):
55
+ lengths = {entry['length'] for cluster in clustered_sequences.values() for entry in cluster}
56
+ perc_idents = {entry['percent_identity'] for cluster in clustered_sequences.values() for entry in cluster}
57
+
58
+ return len(lengths) == 1 and len(perc_idents) == 1
59
+
60
+
61
+ def read_fasta_groups(fasta_file):
62
+ groups = defaultdict(list)
63
+ genome_count = defaultdict(int)
64
+ current_group = None
65
+ current_sequence = []
66
+
67
+ with open(fasta_file, 'r') as f:
68
+ for line in f:
69
+ if line.startswith('>'):
70
+ if current_group is not None:
71
+ groups[current_group].append((current_group_header, ''.join(current_sequence)))
72
+
73
+ current_group_header = line.strip()
74
+ current_group = current_group_header.split('|')[0]
75
+ genome = current_group_header.split('|')[1]
76
+ current_sequence = []
77
+ genome_count[genome] += 1
78
+ else:
79
+ current_sequence.append(line.strip())
80
+
81
+ if current_group is not None:
82
+ groups[current_group].append((current_group_header, ''.join(current_sequence)))
83
+
84
+ return groups, genome_count
85
+
86
+
87
+ def write_fasta(sequences, output_file):
88
+ with open(output_file, 'w') as f:
89
+ for header, seq in sequences:
90
+ f.write(f"{header}\n{seq}\n")
91
+
92
+
93
+ def read_cd_hit_output(clustering_output):
94
+ clusters = OrderedDict()
95
+
96
+ with open(clustering_output, 'r') as f:
97
+ current_cluster_id = None
98
+
99
+ for line in f:
100
+ line = line.strip()
101
+ if line.startswith(">Cluster"):
102
+ current_cluster_id = line.split(' ')[1]
103
+ clusters[current_cluster_id] = []
104
+ elif line and current_cluster_id is not None:
105
+ parts = line.split('\t')
106
+ if len(parts) > 1:
107
+ clustered_info = parts[1]
108
+ length = clustered_info.split(',')[0]
109
+ length = int(''.join(c for c in length if c.isdigit()))
110
+ clustered_header = clustered_info.split('>')[1].split('...')[0]
111
+ clustered_header = '>' + clustered_header
112
+
113
+ if 'at +' in clustered_info:
114
+ percent_identity = float(clustered_info.split('at +/')[1].strip().replace('%', ''))
115
+
116
+ if '*' in line:
117
+ percent_identity = 100.0
118
+
119
+ clusters[current_cluster_id].append({
120
+ 'header': clustered_header,
121
+ 'length': length,
122
+ 'percent_identity': percent_identity
123
+ })
124
+
125
+ return clusters
126
+
127
+
128
+ def separate_groups(input_fasta, options, clustering_mode):
129
+ groups, genome_count = read_fasta_groups(input_fasta)
130
+
131
+ paralog_groups = defaultdict(int) # To track number of paralog groups
132
+
133
+
134
+ for group_header, sequences in groups.items():
135
+ group_name = group_header.split('|')[0] # Get the group part (e.g., '>Group_n')
136
+
137
+ # Count genomes with more than one gene
138
+ genome_to_gene_count = defaultdict(int)
139
+ for header, _ in sequences:
140
+ genome = header.split('|')[1]
141
+ genome_to_gene_count[genome] += 1
142
+
143
+ num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
144
+ total_genomes = len(genome_to_gene_count)
145
+
146
+ # Check if the group meets the threshold for having paralogs
147
+ if total_genomes == 0 or (num_genomes_with_multiple_genes / total_genomes) * 100 < options.percent_threshold:
148
+ continue
149
+
150
+ group_file_name = group_name.replace('>','')
151
+
152
+ temp_fasta = f"{options.output_dir}{group_file_name}.fasta"
153
+ write_fasta(sequences, temp_fasta)
154
+
155
+ # Run cd-hit on the individual group
156
+ clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
157
+ run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
158
+
159
+ # Read the clustering results to find subgroups
160
+ clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')
161
+
162
+ # Detect if all sequences are identical in length and percentage identity
163
+ all_same = check_if_all_identical(clustered_sequences)
164
+
165
+ # **Global subgroup counter for the entire major group**
166
+ subgroup_id = 0
167
+ remaining_sequences = sequences.copy() # Track unprocessed sequences
168
+ sequences_to_remove = []
169
+
170
+ if not all_same:
171
+ while remaining_sequences:
172
+ # Track subgroups for this pass
173
+ subgroup_sequences = []
174
+ genome_seen = set()
175
+ sequences_found = False # Track if any sequence was added
176
+
177
+ # Recalculate representative sequence dynamically based on remaining genes
178
+ rep = calculate_new_rep_seq(
179
+ [entry for cluster in clustered_sequences.values() for entry in cluster if
180
+ entry['header'] in (h for h, _ in remaining_sequences)]
181
+ )
182
+
183
+ # Find the sequence corresponding to rep['header'] from the list of sequences
184
+ rep_seq = next((seq for header, seq in sequences if header == rep['header']), None)
185
+
186
+ # Process each genome to select the best matching sequence
187
+ for genome in genome_to_gene_count:
188
+ best_sequence = None
189
+ best_score = -1 # Initialize with a very low similarity score
190
+
191
+ # Iterate over each sequence in the remaining sequences for this genome
192
+ for header, seq in remaining_sequences:
193
+ genome_id = header.split('|')[1]
194
+
195
+ if genome_id == genome: # Ensure this sequence belongs to the current genome
196
+
197
+ length = len(seq)
198
+ if rep_seq == seq:
199
+ perc_ident = 100.0
200
+ else:
201
+ perc_ident = calculate_similarity(rep_seq, seq) # Define a function to calculate similarity
202
+
203
+ # Calculate the length difference ratio (smaller ratio means closer length to the representative)
204
+ length_diff_ratio = abs(rep['length'] - length) / rep['length']
205
+
206
+ # Check if this sequence is more similar than the current best one
207
+ if length_within_threshold(rep['length'], length,
208
+ options.len_diff) and perc_ident >= options.pident:
209
+
210
+ # Combine percentage identity and length difference into a single score
211
+ # Here, you want a high identity and a small length difference
212
+ # Adjust the weight of length difference and similarity according to your requirements
213
+ score = perc_ident - (length_diff_ratio * 100) # Weighting length diff (you can adjust the *100 factor)
214
+
215
+ # Check if this sequence has a higher score than the current best
216
+ if score > best_score:
217
+ best_score = score
218
+ best_sequence = (header, seq) # Store the best matching sequence for this genome
219
+
220
+ # Once the best sequence is identified, add it to the subgroup
221
+ if best_sequence is not None:
222
+ sequences_found = True # At least one sequence was added
223
+ new_header = f">{group_file_name}_subgroup_{subgroup_id}|{best_sequence[0].split('|')[1]}|{best_sequence[0].split('|')[2]}"
224
+ subgroup_sequences.append((new_header, best_sequence[1]))
225
+ sequences_to_remove.append(best_sequence)
226
+ genome_seen.add(genome)
227
+
228
+ # If no sequences were found for this pass, exit the loop
229
+ # if not sequences_found:
230
+ # break
231
+
232
+ # Write each subgroup into a separate FASTA file
233
+ if subgroup_sequences:
234
+ subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
235
+ write_fasta(subgroup_sequences, subgroup_file)
236
+
237
+ # Remove processed sequences from the remaining list
238
+ remaining_sequences = [item for item in remaining_sequences if
239
+ item[0] not in {h for h, _ in sequences_to_remove}]
240
+
241
+ # Increment subgroup ID globally for the next subgroup
242
+ subgroup_id += 1
243
+ paralog_groups[group_name] += 1 # Count this group as a paralog group
244
+
245
+
246
+ else:
247
+ # Condition 2: If sequences are identical, distribute genes evenly into subgroups
248
+ num_subgroups = 1000
249
+ subgroup_sequences = defaultdict(list) # Store sequences for each subgroup
250
+ genome_count = defaultdict(int) # Count how many genes have been assigned to each genome
251
+
252
+ # Iterate over all sequences regardless of whether the genome has been seen
253
+ for header, seq in sequences:
254
+ genome = header.split('|')[1]
255
+
256
+ # Determine the next subgroup for this genome
257
+ subgroup_id = genome_count[genome] % num_subgroups
258
+ new_header = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
259
+ subgroup_sequences[subgroup_id].append((new_header, seq))
260
+
261
+ # Increment the count for this genome
262
+ genome_count[genome] += 1
263
+
264
+ # Write out each subgroup to a separate FASTA file
265
+ for subgroup_id, seqs in subgroup_sequences.items():
266
+ subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
267
+ write_fasta(seqs, subgroup_file)
268
+
269
+ # Clean up temporary fasta file if the option is set
270
+ if options.delete_temp_files:
271
+ if temp_fasta and os.path.exists(temp_fasta):
272
+ os.remove(temp_fasta)
273
+ if os.path.exists(clustering_output + '.clstr'):
274
+ os.remove(clustering_output + '.clstr')
275
+ if os.path.exists(clustering_output):
276
+ os.remove(clustering_output)
277
+
278
+ # Print metrics about paralog groups
279
+ print(f"Identified {len(paralog_groups)} paralog groups:")
280
+ for group_id, count in paralog_groups.items():
281
+ print(f"Group ID: {group_id}, Number of new groups: {count}")
282
+
283
+
284
+ def main():
285
+ parser = argparse.ArgumentParser(description='Group-Splitter: ' + PyamilySeq_Version + ': A tool to split "paralogous" groups identified by PyamilySeq.')
286
+ ### Required Arguments
287
+ required = parser.add_argument_group('Required Arguments')
288
+ required.add_argument('-input_fasta', action='store', dest='input_fasta',
289
+ help='Input FASTA file containing gene groups.',
290
+ required=True)
291
+ required.add_argument('-output_dir', action='store', dest='output_dir',
292
+ help='Output directory.',
293
+ required=True)
294
+
295
+ optional = parser.add_argument_group('Optional Arguments')
296
+
297
+ optional.add_argument('-pident', action='store', dest='pident', type=float, default=0.9,
298
+ help='Sequence identity threshold (default: 0.9)')
299
+ optional.add_argument('-len_diff', action='store', dest='len_diff', type=float, default=0.05,
300
+ help='Length difference threshold (default: 0.05)')
301
+ optional.add_argument('-clustering_threads', action='store', dest='clustering_threads', type=int, default=4,
302
+ help='Number of threads for clustering (default: 4)')
303
+ optional.add_argument('-clustering_memory', action='store', dest='clustering_memory', type=int, default=2000,
304
+ help='Memory limit in MB for clustering (default: 2000)')
305
+ optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
306
+ help='Minimum percentage of genomes with paralogs (default: 80.0)')
307
+ optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
308
+ optional.add_argument('-delete_temp_files', action='store_true', dest='delete_temp_files',
309
+ help='Delete all temporary files after processing.')
310
+
311
+ misc = parser.add_argument_group('Misc Arguments')
312
+ misc.add_argument('-v', action='store_true', dest='version',
313
+ help='Print out version number and exit',
314
+ required=False)
315
+
316
+ options = parser.parse_args()
317
+
318
+ # Check for version flag
319
+ if options.version:
320
+ print(f"Group-Splitter version {PyamilySeq_Version}")
321
+ exit(0)
322
+
323
+ options = parser.parse_args()
324
+
325
+ if not os.path.exists(options.output_dir):
326
+ os.makedirs(options.output_dir)
327
+
328
+ clustering_mode = 'cd-hit-est'
329
+ separate_groups(options.input_fasta, options, clustering_mode)
330
+
331
+ print("Done")
332
+
333
+
334
+ if __name__ == "__main__":
335
+ main()
PyamilySeq/PyamilySeq.py CHANGED
@@ -124,8 +124,8 @@ def main():
124
124
  output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
125
125
  help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
126
126
  required=False)
127
- output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
128
- help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
127
+ output_args.add_argument('-no_gpa', action='store_false', dest='gene_presence_absence_out',
128
+ help='Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other downstream tools',
129
129
  required=False)
130
130
 
131
131
  ### Misc Arguments
@@ -199,7 +199,7 @@ def cluster(options):
199
199
  outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
200
200
  Number_Of_Second_Extending_But_Same_Genomes))
201
201
 
202
- if options.gene_presence_absence_out != None:
202
+ if options.gene_presence_absence_out != False:
203
203
  gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
204
204
 
205
205
  if options.run_mode == 'Full':
@@ -12,6 +12,8 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
12
12
  from utils import *
13
13
 
14
14
 
15
+ #def output_fasta(options, gene_families):
16
+
15
17
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
16
18
  print("Outputting gene_presence_absence file")
17
19
  output_dir = os.path.abspath(options.output_dir)
@@ -227,7 +229,7 @@ def cluster(options):
227
229
  outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
228
230
  Number_Of_Second_Extending_But_Same_Genomes))
229
231
  #Report number of first and second clusters and do the ame for genus
230
- if options.gene_presence_absence_out != None:
232
+ if options.gene_presence_absence_out != False:
231
233
  gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
232
234
 
233
235
 
@@ -255,7 +257,6 @@ def cluster(options):
255
257
  if options.write_groups != None and options.fasta != None:
256
258
  print("Outputting gene group FASTA files")
257
259
  sequences = read_fasta(options.fasta)
258
- #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
259
260
  output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
260
261
  write_groups(options,output_dir, key_order, cores, sequences,
261
262
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
PyamilySeq/utils.py CHANGED
@@ -5,6 +5,7 @@ import glob
5
5
  import collections
6
6
  from tempfile import NamedTemporaryFile
7
7
  import sys
8
+ from line_profiler_pycharm import profile
8
9
 
9
10
 
10
11
  ################### We are currently fixed using Table 11
@@ -30,6 +31,66 @@ def translate_frame(sequence):
30
31
  translate = ''.join([gencode.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
31
32
  return translate
32
33
 
34
+ @profile
35
+ def calculate_similarity(seq1, seq2):
36
+ len1, len2 = len(seq1), len(seq2)
37
+
38
+ # If lengths are the same, directly compare without alignment
39
+ if len1 == len2:
40
+ matches = sum(c1 == c2 for c1, c2 in zip(seq1, seq2))
41
+ return (matches / len1) * 100 # Return similarity based on the length
42
+
43
+ # For different lengths, proceed with global alignment
44
+ # Initialize the scoring matrix
45
+ score_matrix = [[0] * (len2 + 1) for _ in range(len1 + 1)]
46
+
47
+ # Fill the first row and first column with gap penalties
48
+ for i in range(len1 + 1):
49
+ score_matrix[i][0] = -i # Gap penalty for seq1
50
+ for j in range(len2 + 1):
51
+ score_matrix[0][j] = -j # Gap penalty for seq2
52
+
53
+ # Fill the score matrix
54
+ for i in range(1, len1 + 1):
55
+ for j in range(1, len2 + 1):
56
+ match = score_matrix[i - 1][j - 1] + (1 if seq1[i - 1] == seq2[j - 1] else -1)
57
+ delete = score_matrix[i - 1][j] - 1 # Gap in seq2
58
+ insert = score_matrix[i][j - 1] - 1 # Gap in seq1
59
+ score_matrix[i][j] = max(match, delete, insert)
60
+
61
+ # Traceback to find the alignment (if needed for detailed output)
62
+ aligned_seq1, aligned_seq2 = "", ""
63
+ i, j = len1, len2
64
+
65
+ while i > 0 or j > 0:
66
+ current_score = score_matrix[i][j]
67
+ if i > 0 and j > 0 and current_score == score_matrix[i - 1][j - 1] + (1 if seq1[i - 1] == seq2[j - 1] else -1):
68
+ aligned_seq1 += seq1[i - 1]
69
+ aligned_seq2 += seq2[j - 1]
70
+ i -= 1
71
+ j -= 1
72
+ elif i > 0 and current_score == score_matrix[i - 1][j] - 1:
73
+ aligned_seq1 += seq1[i - 1]
74
+ aligned_seq2 += "-"
75
+ i -= 1
76
+ else:
77
+ aligned_seq1 += "-"
78
+ aligned_seq2 += seq2[j - 1]
79
+ j -= 1
80
+
81
+ # Reverse the aligned sequences if needed
82
+ aligned_seq1 = aligned_seq1[::-1]
83
+ aligned_seq2 = aligned_seq2[::-1]
84
+
85
+ # Calculate matches from aligned sequences
86
+ matches = sum(c1 == c2 for c1, c2 in zip(aligned_seq1, aligned_seq2))
87
+
88
+ # Calculate the similarity percentage based on the maximum length
89
+ max_length = max(len(seq1), len(seq2))
90
+ return (matches / max_length) * 100
91
+
92
+
93
+
33
94
  def is_tool_installed(tool_name):
34
95
  """Check if a tool is installed and available in PATH."""
35
96
  # Check if the tool is in the system PATH
@@ -265,30 +326,57 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
265
326
  combined_out_file.write(f">{genome_name}|{id}\n{wrapped_sequence}\n")
266
327
 
267
328
 
268
- def write_groups(options,output_dir, key_order, cores, sequences, pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
329
+ def write_groups(options, output_dir, key_order, cores, sequences,
330
+ pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
331
+ """
332
+ Writes individual FASTA files and a combined FASTA file for all sequences.
333
+
334
+ Parameters:
335
+ - options: Command-line options.
336
+ - output_dir: Directory where output FASTA files will be saved.
337
+ - key_order: The order in which to process keys.
338
+ - cores: Dictionary of core genes.
339
+ - sequences: Dictionary mapping headers to sequences.
340
+ - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
341
+ - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
342
+ """
269
343
  # Create output directory if it doesn't exist
270
344
  if not os.path.exists(output_dir):
271
345
  os.makedirs(output_dir)
272
- for key_prefix in key_order:
273
- for key, values in cores.items():
274
- if any(part in options.write_groups.split(',') for part in key.split('_')):
275
- if key.startswith(key_prefix):
276
- for value in values:
277
- output_filename = f"{key}_{value}.fasta"
278
- if 'First' in key_prefix:
279
- sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
280
- else: # combined_pangenome_clusters_Second_sequences is None if reclustered isn't being used
281
- sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
282
- # Write sequences to output file that are in the sequences dictionary
283
- with open(os.path.join(output_dir, output_filename), 'w') as outfile:
284
- for header in sequences_to_write:
285
- if header in sequences:
286
- outfile.write(f">{header}\n")
287
- wrapped_sequence = wrap_sequence(sequences[header])
288
- outfile.write(f"{wrapped_sequence}\n")
289
- else:
290
- if options.verbose == True:
291
- print("Sequence " + header + " Not found in original_fasta file.")
346
+
347
+ combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences.fasta")
348
+
349
+ # Open combined FASTA file for writing all sequences
350
+ with open(combined_fasta_filename, 'w') as combined_fasta:
351
+ for key_prefix in key_order:
352
+ for key, values in cores.items():
353
+ if any(part in options.write_groups.split(',') for part in key.split('_')):
354
+ if key.startswith(key_prefix):
355
+ for value in values:
356
+ output_filename = f"{key}_{value}.fasta"
357
+ if 'First' in key_prefix:
358
+ sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
359
+ else:
360
+ sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
361
+
362
+ # Write individual FASTA file
363
+ with open(os.path.join(output_dir, output_filename), 'w') as outfile:
364
+ for header in sequences_to_write:
365
+ if header in sequences:
366
+ sequence = sequences[header]
367
+ outfile.write(f">{header}\n")
368
+ wrapped_sequence = wrap_sequence(sequence)
369
+ outfile.write(f"{wrapped_sequence}\n")
370
+
371
+ # Also write to the combined FASTA file
372
+ combined_fasta.write(f">Group_{value}|{header}\n")
373
+ combined_fasta.write(f"{wrapped_sequence}\n")
374
+ else:
375
+ if options.verbose:
376
+ print(f"Sequence {header} not found in original_fasta file.")
377
+
378
+ print(f"Combined FASTA file saved to: {combined_fasta_filename}")
379
+
292
380
 
293
381
  def process_gene_families(options, directory, output_file):
294
382
  """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PyamilySeq
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
5
  Home-page: https://github.com/NickJD/PyamilySeq
6
6
  Author: Nicholas Dimonaco
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
58
58
  ```
59
59
  ### Example output:
60
60
  ```
61
- Running PyamilySeq v0.7.0
61
+ Running PyamilySeq v0.8.0
62
62
  Calculating Groups
63
63
  Gene Groups:
64
64
  First_core_99: 2682
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
80
80
  -cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
81
81
  ```
82
82
  ```commandline
83
- Running PyamilySeq v0.7.0
83
+ Running PyamilySeq v0.8.0
84
84
  Calculating Groups
85
85
  Genus Groups:
86
86
  First_genera_1: 28549
@@ -98,37 +98,36 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
98
98
  ### Reclustering can be used to see where additional sequences/genes lay in relation to a contemporary pangenome/gene grouping.
99
99
  ```
100
100
  PyamilySeq -run_mode Partial -group_mode Species -clustering_format CD-HIT -output_dir .../test_data/species/CD-HIT/testing
101
- -cluster_file .../test_data/species/CD-HIT/E-coli_extracted_cds_cd-hit_90_60.clstr -gpa
102
- -reclustered .../test_data/species/CD-HIT/E-coli_extracted_cds_cd-hit_90_60_And_StORFs_cds_90_60.clstr
101
+ -cluster_file .../test_data/species/CD-HIT/E-coli_extracted_cds_cd-hit_80_60.clstr -gpa
102
+ -reclustered .../test_data/species/CD-HIT/E-coli_extracted_cds_cd-hit_80_60_And_StORFs_cds_80_60.clstr
103
103
  ```
104
104
  #### As can be seen below, the additional sequences recovered by the StORF-Reporter annotation tool have 'extended' contemporary or created entirely new gene groups. 'First' corresponds to the groups identified from the first clustering round and 'Second' for the second. In 'reclustering' mode, First_core_# groups are unaffected thus retaining the initial grouping information.
105
105
  ```commandline
106
- Running PyamilySeq v0.7.0
107
106
  Calculating Groups
108
107
  Gene Groups:
109
- First_core_99: 69
110
- First_core_95: 1002
111
- First_core_15: 4716
112
- First_core_0: 37960
113
- extended_core_99: 6
114
- extended_core_95: 73
115
- extended_core_15: 331
116
- extended_core_0: 582
117
- combined_core_99: 4
118
- combined_core_95: 88
119
- combined_core_15: 455
120
- combined_core_0: 228
108
+ First_core_99: 587
109
+ First_core_95: 1529
110
+ First_core_15: 3708
111
+ First_core_0: 29992
112
+ extended_core_99: 29
113
+ extended_core_95: 67
114
+ extended_core_15: 431
115
+ extended_core_0: 1331
116
+ combined_core_99: 2
117
+ combined_core_95: 4
118
+ combined_core_15: 5
119
+ combined_core_0: 4
121
120
  Second_core_99: 0
122
- Second_core_95: 5
123
- Second_core_15: 254
124
- Second_core_0: 3714
125
- only_Second_core_99: 6
126
- only_Second_core_95: 364
127
- only_Second_core_15: 3950
128
- only_Second_core_0: 31269
129
- Total Number of First Gene Groups (Including Singletons): 43747
130
- Total Number of Second Gene Groups (Including Singletons): 66525
131
- Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: 9593
121
+ Second_core_95: 6
122
+ Second_core_15: 172
123
+ Second_core_0: 1825
124
+ only_Second_core_99: 53
125
+ only_Second_core_95: 493
126
+ only_Second_core_15: 3806
127
+ only_Second_core_0: 27569
128
+ Total Number of First Gene Groups (Including Singletons): 35816
129
+ Total Number of Second Gene Groups (Including Singletons): 67728
130
+ Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: 136
132
131
  Outputting gene_presence_absence file
133
132
  Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq
134
133
  Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
@@ -138,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
138
137
  ## PyamilySeq - Menu:
139
138
  ### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
140
139
  ```
141
- Running PyamilySeq v0.7.0
140
+ Running PyamilySeq v0.8.0
142
141
  usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
143
142
  [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
144
143
  [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
145
144
  [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
146
145
  [-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
147
146
 
148
- PyamilySeq v0.7.0: A tool that groups genes into unique clusters.
147
+ PyamilySeq v0.8.0: A tool that groups genes into unique clusters.
149
148
 
150
149
  options:
151
150
  -h, --help show this help message and exit
@@ -198,9 +197,9 @@ Output Parameters:
198
197
  -w WRITE_GROUPS Default - No output: Output sequences of identified groups (provide levels at which to output - Species "-w 99,95" Genus "-w 2,3" -
199
198
  Must provide FASTA file with -original_fasta if in Partial run mode.
200
199
  -a Default - No output: SLOW! (Only works for Species mode) Output aligned and concatenated sequences of identified groups -provide
201
- group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partialrun mode.
200
+ group levels at which to output "-w 99,95" - Must provide FASTA file with -original_fasta in Partial run mode.
202
201
  -original_fasta ORIGINAL_FASTA
203
- FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.
202
+ FASTA file to use in conjunction with "-w" or "-a" when running in Partial Mode.
204
203
  -gpa Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and
205
204
  other downstream tools
206
205
 
@@ -222,7 +221,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
222
221
  ```
223
222
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
224
223
 
225
- Seq-Combiner v0.7.0: A tool to extract sequences from GFF/FASTA files.
224
+ Seq-Combiner v0.8.0: A tool to extract sequences from GFF/FASTA files.
226
225
 
227
226
  options:
228
227
  -h, --help show this help message and exit
@@ -248,4 +247,38 @@ Misc Arguments:
248
247
 
249
248
 
250
249
  ```
250
+
251
+ ### Group-Splitter menu:
252
+
253
+ ```
254
+ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -output_dir OUTPUT_DIR [-pident PIDENT] [-len_diff LEN_DIFF] [-clustering_threads CLUSTERING_THREADS]
255
+ [-clustering_memory CLUSTERING_MEMORY] [-percent_threshold PERCENT_THRESHOLD] [-verbose] [-delete_temp_files] [-v]
256
+
257
+ Group-Splitter: v0.8.0: A tool to split "paralogous" groups identified by PyamilySeq.
258
+
259
+ options:
260
+ -h, --help show this help message and exit
261
+
262
+ Required Arguments:
263
+ -input_fasta INPUT_FASTA
264
+ Input FASTA file containing gene groups.
265
+ -output_dir OUTPUT_DIR
266
+ Output directory.
267
+
268
+ Optional Arguments:
269
+ -pident PIDENT Sequence identity threshold (default: 0.9)
270
+ -len_diff LEN_DIFF Length difference threshold (default: 0.05)
271
+ -clustering_threads CLUSTERING_THREADS
272
+ Number of threads for clustering (default: 4)
273
+ -clustering_memory CLUSTERING_MEMORY
274
+ Memory limit in MB for clustering (default: 2000)
275
+ -percent_threshold PERCENT_THRESHOLD
276
+ Minimum percentage of genomes with paralogs (default: 80.0)
277
+ -verbose Print verbose output.
278
+ -delete_temp_files Delete all temporary files after processing.
279
+
280
+ Misc Arguments:
281
+ -v Print out version number and exit
282
+ ```
283
+
251
284
  ### All example input and output data can be found in the 'test_data' directory.
@@ -0,0 +1,15 @@
1
+ PyamilySeq/Constants.py,sha256=lbVZv4vDHroA83KCDTIGuVb6bubKYZbwLmhYHxedXQc,31
2
+ PyamilySeq/Group_Splitter.py,sha256=raZMV9SN7Qqw5Hci5qpkaahR66JMQf6dX8TvThjh3kU,14986
3
+ PyamilySeq/PyamilySeq.py,sha256=0607A9nqafoQ8IhBxGgGJ-v3DVV6C6-LgzdDIXb2C-c,15179
4
+ PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
5
+ PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
6
+ PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
7
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
9
+ PyamilySeq/utils.py,sha256=6UtYJW3_0rDhEhvrJi6R3smvKu2n_bjqUkuzr5DcJM4,19061
10
+ PyamilySeq-0.8.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
11
+ PyamilySeq-0.8.0.dist-info/METADATA,sha256=ZnpQvAQy5EXGrzS0G9y5qH2Rhmb0LW2HvOT-b5WJLoo,14436
12
+ PyamilySeq-0.8.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
13
+ PyamilySeq-0.8.0.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
14
+ PyamilySeq-0.8.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
15
+ PyamilySeq-0.8.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.1.0)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,3 +1,4 @@
1
1
  [console_scripts]
2
+ Group-Splitter = PyamilySeq.Group_Splitter:main
2
3
  PyamilySeq = PyamilySeq.PyamilySeq:main
3
4
  Seq-Combiner = PyamilySeq.Seq_Combiner:main
@@ -1,14 +0,0 @@
1
- PyamilySeq/Constants.py,sha256=RSX5-UuBXOrbEv3ETN415RwqoB6WmNe5eD-p7L15CJA,31
2
- PyamilySeq/PyamilySeq.py,sha256=wmdOVxxRKqsamsEWgnVVCYETUaYOEQQVYERpClrg4Zw,15203
3
- PyamilySeq/PyamilySeq_Genus.py,sha256=ZjD61mTW7NgmsfGfFVEXeIZoSCha9PaLtMPnqdTtacU,12413
4
- PyamilySeq/PyamilySeq_Species.py,sha256=WL6pu8hlGpnemcpu1tLFmlUlPd4vJpQSW4Om5Hclu_k,14438
5
- PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
6
- PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
8
- PyamilySeq/utils.py,sha256=-0OZxmX96kOTzms8gnbFBvc5DL6NsqNHNpLpQ4UjNk8,15726
9
- PyamilySeq-0.7.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
10
- PyamilySeq-0.7.0.dist-info/METADATA,sha256=JDhA1JdFaESNwtqzWjRgaA92lrmVFNJWQZUonHgJuvA,13105
11
- PyamilySeq-0.7.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
12
- PyamilySeq-0.7.0.dist-info/entry_points.txt,sha256=QtXD1tmnLvRAkIpGWZgXm1lfLH8GGeCwxmgoHZaTp98,102
13
- PyamilySeq-0.7.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
14
- PyamilySeq-0.7.0.dist-info/RECORD,,