PyamilySeq 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1,2 +1,2 @@
- PyamilySeq_Version = 'v0.7.1'
+ PyamilySeq_Version = 'v0.8.1'
 
PyamilySeq/Group_Splitter.py ADDED
@@ -0,0 +1,350 @@
+ import subprocess
+ import os
+ import argparse
+ from collections import defaultdict, OrderedDict
+ from line_profiler_pycharm import profile
+ 
+ try:
+     from .Constants import *
+     from .utils import *
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
+     from Constants import *
+     from utils import *
+ 
+ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
+     cdhit_command = [
+         clustering_mode,
+         '-i', input_file,
+         '-o', clustering_output,
+         '-c', str(options.pident),
+         '-s', str(options.len_diff),
+         '-T', str(options.clustering_threads),
+         '-M', str(options.clustering_memory),
+         '-d', "0",
+         '-sc', "1",
+         '-sf', "1"
+     ]
+     if options.verbose:
+         subprocess.run(cdhit_command)
+     else:
+         subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ 
+ 
+ def calculate_new_rep_seq(cluster_data):
+     total_length = sum(entry['length'] for entry in cluster_data)
+     avg_length = total_length / len(cluster_data)
+ 
+     total_identity = sum(entry['percent_identity'] for entry in cluster_data)
+     avg_identity = total_identity / len(cluster_data)
+ 
+     # Calculate a score based on both length difference and percent identity
+     def score(entry):
+         length_diff = abs(entry['length'] - avg_length)
+         identity_diff = abs(entry['percent_identity'] - avg_identity)
+         return length_diff + (100 - identity_diff)  # You can weight these differently
+ 
+     rep_entry = min(cluster_data, key=score)
+     return rep_entry
+ 
+ 
+ def length_within_threshold(rep_length, length, len_diff):
+     return abs(rep_length - length) / rep_length <= len_diff
+ 
+ 
+ def check_if_all_identical(clustered_sequences):
+     lengths = {entry['length'] for cluster in clustered_sequences.values() for entry in cluster}
+     perc_idents = {entry['percent_identity'] for cluster in clustered_sequences.values() for entry in cluster}
+ 
+     return len(lengths) == 1 and len(perc_idents) == 1
+ 
+ 
+ def read_fasta_groups(fasta_file):
+     groups = defaultdict(list)
+     genome_count = defaultdict(int)
+     current_group = None
+     current_sequence = []
+ 
+     with open(fasta_file, 'r') as f:
+         for line in f:
+             if line.startswith('>'):
+                 if current_group is not None:
+                     groups[current_group].append((current_group_header, ''.join(current_sequence)))
+ 
+                 current_group_header = line.strip()
+                 current_group = current_group_header.split('|')[0]
+                 genome = current_group_header.split('|')[1]
+                 current_sequence = []
+                 genome_count[genome] += 1
+             else:
+                 current_sequence.append(line.strip())
+ 
+     if current_group is not None:
+         groups[current_group].append((current_group_header, ''.join(current_sequence)))
+ 
+     return groups, genome_count
+ 
+ 
+ def write_fasta(sequences, output_file):
+     with open(output_file, 'w') as f:
+         for header, seq in sequences:
+             f.write(f"{header}\n{seq}\n")
+ 
+ 
+ def read_cd_hit_output(clustering_output):
+     clusters = OrderedDict()
+ 
+     with open(clustering_output, 'r') as f:
+         current_cluster_id = None
+ 
+         for line in f:
+             line = line.strip()
+             if line.startswith(">Cluster"):
+                 current_cluster_id = line.split(' ')[1]
+                 clusters[current_cluster_id] = []
+             elif line and current_cluster_id is not None:
+                 parts = line.split('\t')
+                 if len(parts) > 1:
+                     clustered_info = parts[1]
+                     length = clustered_info.split(',')[0]
+                     length = int(''.join(c for c in length if c.isdigit()))
+                     clustered_header = clustered_info.split('>')[1].split('...')[0]
+                     clustered_header = '>' + clustered_header
+ 
+                     if 'at' in clustered_info:
+                         percent_identity = extract_identity(line)
+ 
+                     elif '*' in line:
+                         percent_identity = 100.0
+                     else:
+                         raise ValueError("Percent identity not found in the string.")
+ 
+                     clusters[current_cluster_id].append({
+                         'header': clustered_header,
+                         'length': length,
+                         'percent_identity': percent_identity
+                     })
+ 
+     return clusters
+ 
+ 
+ def separate_groups(input_fasta, options, clustering_mode):
+     groups, genome_count = read_fasta_groups(input_fasta)
+ 
+     paralog_groups = defaultdict(int)  # To track number of paralog groups
+ 
+     for group_header, sequences in groups.items():
+         group_name = group_header.split('|')[0]  # Get the group part (e.g., '>Group_n')
+ 
+         # Count genomes with more than one gene
+         genome_to_gene_count = defaultdict(int)
+         for header, _ in sequences:
+             genome = header.split('|')[1]
+             genome_to_gene_count[genome] += 1
+ 
+         num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
+         total_genomes = len(genome_to_gene_count)
+ 
+         # Check if the group meets the threshold for having paralogs
+         if total_genomes == 0 or (num_genomes_with_multiple_genes / total_genomes) * 100 < options.percent_threshold:
+             continue
+ 
+         group_file_name = group_name.replace('>','')
+ 
+         temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
+         write_fasta(sequences, temp_fasta)
+ 
+         # Run cd-hit on the individual group
+         clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
+ 
+         run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
+ 
+         # Read the clustering results to find subgroups
+         clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')
+ 
+         # Detect if all sequences are identical in length and percentage identity
+         all_same = check_if_all_identical(clustered_sequences)
+ 
+         # **Global subgroup counter for the entire major group**
+         subgroup_id = 0
+         remaining_sequences = sequences.copy()  # Track unprocessed sequences
+         sequences_to_remove = []
+ 
+         if not all_same:
+             while remaining_sequences:
+                 # Track subgroups for this pass
+                 subgroup_sequences = []
+                 genome_seen = set()
+                 sequences_found = False  # Track if any sequence was added
+ 
+                 # Recalculate representative sequence dynamically based on remaining genes
+                 rep = calculate_new_rep_seq(
+                     [entry for cluster in clustered_sequences.values() for entry in cluster if
+                      entry['header'] in (h for h, _ in remaining_sequences)]
+                 )
+ 
+                 # Find the sequence corresponding to rep['header'] from the list of sequences
+                 rep_seq = next((seq for header, seq in sequences if header == rep['header']), None)
+ 
+                 # Process each genome to select the best matching sequence
+                 for genome in genome_to_gene_count:
+                     best_sequence = None
+                     best_score = -1  # Initialize with a very low similarity score
+ 
+                     # Iterate over each sequence in the remaining sequences for this genome
+                     for header, seq in remaining_sequences:
+                         genome_id = header.split('|')[1]
+ 
+                         if genome_id == genome:  # Ensure this sequence belongs to the current genome
+ 
+                             length = len(seq)
+                             if rep_seq == seq:
+                                 perc_ident = 100.0
+                             else:
+                                 perc_ident = calculate_similarity(rep_seq, seq)  # Define a function to calculate similarity
+ 
+                             # Calculate the length difference ratio (smaller ratio means closer length to the representative)
+                             length_diff_ratio = abs(rep['length'] - length) / rep['length']
+ 
+                             # Check if this sequence is more similar than the current best one
+                             if length_within_threshold(rep['length'], length,
+                                                        options.len_diff) and perc_ident >= options.pident:
+ 
+                                 # Combine percentage identity and length difference into a single score
+                                 # Here, you want a high identity and a small length difference
+                                 # Adjust the weight of length difference and similarity according to your requirements
+                                 score = perc_ident - (length_diff_ratio * 100)  # Weighting length diff (you can adjust the *100 factor)
+ 
+                                 # Check if this sequence has a higher score than the current best
+                                 if score > best_score:
+                                     best_score = score
+                                     best_sequence = (header, seq)  # Store the best matching sequence for this genome
+ 
+                     # Once the best sequence is identified, add it to the subgroup
+                     if best_sequence is not None:
+                         sequences_found = True  # At least one sequence was added
+                         new_header = f">{group_file_name}_subgroup_{subgroup_id}|{best_sequence[0].split('|')[1]}|{best_sequence[0].split('|')[2]}"
+                         subgroup_sequences.append((new_header, best_sequence[1]))
+                         sequences_to_remove.append(best_sequence)
+                         genome_seen.add(genome)
+ 
+                 # If no sequences were found for this pass, exit the loop
+                 # if not sequences_found:
+                 #     break
+ 
+                 # Write each subgroup into a separate FASTA file
+                 if subgroup_sequences:
+                     subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
+                     write_fasta(subgroup_sequences, subgroup_file)
+ 
+                 # Remove processed sequences from the remaining list
+                 remaining_sequences = [item for item in remaining_sequences if
+                                        item[0] not in {h for h, _ in sequences_to_remove}]
+ 
+                 # Increment subgroup ID globally for the next subgroup
+                 subgroup_id += 1
+                 paralog_groups[group_name] += 1  # Count this group as a paralog group
+ 
+ 
+         else:
+             # Condition 2: If sequences are identical, distribute genes evenly into subgroups
+             num_subgroups = 1000
+             subgroup_sequences = defaultdict(list)  # Store sequences for each subgroup
+             genome_count = defaultdict(int)  # Count how many genes have been assigned to each genome
+ 
+             # Iterate over all sequences regardless of whether the genome has been seen
+             for header, seq in sequences:
+                 genome = header.split('|')[1]
+ 
+                 # Determine the next subgroup for this genome
+                 subgroup_id = genome_count[genome] % num_subgroups
+                 new_header = f"{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
+                 subgroup_sequences[subgroup_id].append((new_header, seq))
+ 
+                 # Increment the count for this genome
+                 genome_count[genome] += 1
+ 
+             # Write out each subgroup to a separate FASTA file
+             for subgroup_id, seqs in subgroup_sequences.items():
+                 subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
+                 write_fasta(seqs, subgroup_file)
+ 
+                 # Increment subgroup ID globally for the next subgroup
+                 subgroup_id += 1
+                 paralog_groups[group_name] += 1  # Count this group as a paralog group
+ 
+ 
+ 
+         # Clean up temporary fasta file if the option is set
+         if options.delete_temp_files:
+             if temp_fasta and os.path.exists(temp_fasta):
+                 os.remove(temp_fasta)
+             if os.path.exists(clustering_output + '.clstr'):
+                 os.remove(clustering_output + '.clstr')
+             if os.path.exists(clustering_output):
+                 os.remove(clustering_output)
+ 
+     # Print metrics about paralog groups
+     print(f"Identified {len(paralog_groups)} paralog groups:")
+     for group_id, count in paralog_groups.items():
+         print(f"Group ID: {group_id}, Number of new groups: {count}")
+ 
+ 
+ def main():
+     parser = argparse.ArgumentParser(description='Group-Splitter: ' + PyamilySeq_Version + ': A tool to split "paralogous" groups identified by PyamilySeq.')
+     ### Required Arguments
+     required = parser.add_argument_group('Required Arguments')
+     required.add_argument('-input_fasta', action='store', dest='input_fasta',
+                           help='Input FASTA file containing gene groups.',
+                           required=True)
+     required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
+                           help='Default - DNA: Are groups "DNA" or "AA" sequences?',
+                           required=False)
+     required.add_argument('-output_dir', action='store', dest='output_dir',
+                           help='Output directory.',
+                           required=True)
+ 
+     optional = parser.add_argument_group('Optional Arguments')
+ 
+     optional.add_argument('-pident', action='store', dest='pident', type=float, default=0.9,
+                           help='Sequence identity threshold (default: 0.9)')
+     optional.add_argument('-len_diff', action='store', dest='len_diff', type=float, default=0.05,
+                           help='Length difference threshold (default: 0.05)')
+     optional.add_argument('-clustering_threads', action='store', dest='clustering_threads', type=int, default=4,
+                           help='Number of threads for clustering (default: 4)')
+     optional.add_argument('-clustering_memory', action='store', dest='clustering_memory', type=int, default=2000,
+                           help='Memory limit in MB for clustering (default: 2000)')
+     optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
+                           help='Minimum percentage of genomes with paralogs (default: 80.0)')
+     optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
+     optional.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
+                           help='Default: Delete all temporary files after processing.')
+ 
+     misc = parser.add_argument_group('Misc Arguments')
+     misc.add_argument('-v', action='store_true', dest='version',
+                       help='Print out version number and exit',
+                       required=False)
+ 
+     options = parser.parse_args()
+ 
+     # Check for version flag
+     if options.version:
+         print(f"Group-Splitter version {PyamilySeq_Version}")
+         exit(0)
+ 
+     options = parser.parse_args()
+ 
+     if not os.path.exists(options.output_dir):
+         os.makedirs(options.output_dir)
+ 
+     if options.sequence_type == 'DNA':
+         clustering_mode = 'cd-hit-est'
+     else:
+         clustering_mode = 'cd-hit'
+ 
+     separate_groups(options.input_fasta, options, clustering_mode)
+ 
+     print("Done")
+ 
+ 
+ if __name__ == "__main__":
+     main()
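For orientation, a hypothetical invocation of the new tool (paths are placeholders; the `Group-Splitter` console script is registered in `entry_points.txt` below), followed by the CD-HIT command that `run_cd_hit` builds per group with the defaults above:

```commandline
Group-Splitter -input_fasta .../Gene_Families_Output/combined_group_sequences.fasta -sequence_type DNA -output_dir .../split_groups

# For each paralog group, run_cd_hit then shells out to roughly:
cd-hit-est -i .../split_groups/Group_n.fasta -o .../split_groups/Group_n_clustering -c 0.9 -s 0.05 -T 4 -M 2000 -d 0 -sc 1 -sf 1
```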
PyamilySeq/PyamilySeq.py CHANGED
@@ -27,7 +27,7 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
          '-o', clustering_output,
          '-c', str(options.pident),
          '-s', str(options.len_diff),
-         '-T', str(options.clustering_threads),
+         '-T', str(options.threads),
          '-M', str(options.clustering_memory),
          '-d', "0",
          '-sc', "1",
@@ -41,7 +41,6 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
  
  def main():
      parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
-     vparser = argparse.ArgumentParser()
      ### Required Arguments
      required = parser.add_argument_group('Required Arguments')
      required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
@@ -85,8 +84,8 @@ def main():
      clustering_args.add_argument("-mem", action="store", dest="clustering_memory", type=int, default=4000,
                                   help="Default 4000: Memory to be allocated for clustering (in MBs).",
                                   required=False)
-     clustering_args.add_argument("-t", action="store", dest="clustering_threads", type=int, default=4,
-                                  help="Default 4: Threads to be allocated for clustering.",
+     clustering_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
+                                  help="Default 8: Threads to be allocated for clustering and/or alignment.",
                                   required=False)
  
      ###Partial-Mode Arguments
@@ -125,28 +124,19 @@ def main():
      output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
                               help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
                               required=False)
-     output_args.add_argument('-gpa', action='store_true', dest='gene_presence_absence_out', default=None,
-                              help='Default - False: If selected, a Roary/Panaroo formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
+     output_args.add_argument('-no_gpa', action='store_false', dest='gene_presence_absence_out',
+                              help='Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other downstream tools',
                               required=False)
  
      ### Misc Arguments
      misc = parser.add_argument_group('Misc')
-     misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
-                       required = False)
- 
-     ### Version Arguments
-     version = vparser.add_argument_group('Version')
-     version.add_argument('-v', action='store_true', dest='version',
+     misc.add_argument('-verbose', action='store_true', dest='verbose', default=None,
+                       help='Default - False: Print out runtime messages',
+                       required = False)
+     misc.add_argument('-v', action='store_true', dest='version',
                        help='Default - False: Print out version number and exit',
                        required=False)
  
- 
- 
-     args, unknown = vparser.parse_known_args()
- 
-     if args.version == True:
-         sys.exit("PyamilySeq version: "+PyamilySeq_Version)
- 
      options = parser.parse_args()
  
      ### Checking all required parameters are provided by user #!!# Doesn't seem to work
@@ -265,6 +255,7 @@ def main():
          self.output_dir = options.output_dir
          self.gene_presence_absence_out = options.gene_presence_absence_out
          self.write_groups = options.write_groups
+         self.threads = options.threads
          self.align_core = options.align_core
          self.fasta = combined_out_file
          self.verbose = options.verbose
@@ -283,6 +274,7 @@ def main():
          self.output_dir = options.output_dir
          self.gene_presence_absence_out = options.gene_presence_absence_out
          self.write_groups = options.write_groups
+         self.threads = options.threads
          self.align_core = options.align_core
          self.fasta = options.original_fasta
          self.verbose = options.verbose
@@ -299,5 +291,5 @@ def main():
            "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
  
  if __name__ == "__main__":
-     #print("Running PyamilySeq "+PyamilySeq_Version)
+     print("Running PyamilySeq "+PyamilySeq_Version)
      main()
PyamilySeq/PyamilySeq_Genus.py CHANGED
@@ -199,7 +199,7 @@ def cluster(options):
          outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
              Number_Of_Second_Extending_But_Same_Genomes))
  
-     if options.gene_presence_absence_out != None:
+     if options.gene_presence_absence_out != False:
          gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
  
      if options.run_mode == 'Full':
PyamilySeq/PyamilySeq_Species.py CHANGED
@@ -12,6 +12,8 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
      from utils import *
  
  
+ #def output_fasta(options, gene_families):
+ 
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
      print("Outputting gene_presence_absence file")
      output_dir = os.path.abspath(options.output_dir)
@@ -227,7 +229,7 @@ def cluster(options):
          outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
              Number_Of_Second_Extending_But_Same_Genomes))
      #Report number of first and second clusters and do the same for genus
-     if options.gene_presence_absence_out != None:
+     if options.gene_presence_absence_out != False:
          gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
  
  
@@ -255,7 +257,6 @@ def cluster(options):
      if options.write_groups != None and options.fasta != None:
          print("Outputting gene group FASTA files")
          sequences = read_fasta(options.fasta)
-         #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
          output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
          write_groups(options,output_dir, key_order, cores, sequences,
                       pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
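Note on the `-gpa` to `-no_gpa` change above: with `action='store_false'`, argparse defaults `gene_presence_absence_out` to `True`, which is why the guards in `cluster()` now test `!= False` rather than `!= None`. A minimal, standalone sketch of the semantics (not PyamilySeq code):

```python
import argparse

# store_false flags default to True, making the output opt-out rather than opt-in.
p = argparse.ArgumentParser()
p.add_argument('-no_gpa', action='store_false', dest='gene_presence_absence_out')

print(p.parse_args([]).gene_presence_absence_out)           # True  -> gpa file written by default
print(p.parse_args(['-no_gpa']).gene_presence_absence_out)  # False -> skipped, matching the new '!= False' guard
```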
PyamilySeq/utils.py CHANGED
@@ -5,6 +5,8 @@ import glob
  import collections
  from tempfile import NamedTemporaryFile
  import sys
+ from line_profiler_pycharm import profile
+ import re
  
  
  ################### We are currently fixed using Table 11
@@ -30,6 +32,66 @@ def translate_frame(sequence):
      translate = ''.join([gencode.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
      return translate
  
+ @profile
+ def calculate_similarity(seq1, seq2):
+     len1, len2 = len(seq1), len(seq2)
+ 
+     # If lengths are the same, directly compare without alignment
+     if len1 == len2:
+         matches = sum(c1 == c2 for c1, c2 in zip(seq1, seq2))
+         return (matches / len1) * 100  # Return similarity based on the length
+ 
+     # For different lengths, proceed with global alignment
+     # Initialize the scoring matrix
+     score_matrix = [[0] * (len2 + 1) for _ in range(len1 + 1)]
+ 
+     # Fill the first row and first column with gap penalties
+     for i in range(len1 + 1):
+         score_matrix[i][0] = -i  # Gap penalty for seq1
+     for j in range(len2 + 1):
+         score_matrix[0][j] = -j  # Gap penalty for seq2
+ 
+     # Fill the score matrix
+     for i in range(1, len1 + 1):
+         for j in range(1, len2 + 1):
+             match = score_matrix[i - 1][j - 1] + (1 if seq1[i - 1] == seq2[j - 1] else -1)
+             delete = score_matrix[i - 1][j] - 1  # Gap in seq2
+             insert = score_matrix[i][j - 1] - 1  # Gap in seq1
+             score_matrix[i][j] = max(match, delete, insert)
+ 
+     # Traceback to find the alignment (if needed for detailed output)
+     aligned_seq1, aligned_seq2 = "", ""
+     i, j = len1, len2
+ 
+     while i > 0 or j > 0:
+         current_score = score_matrix[i][j]
+         if i > 0 and j > 0 and current_score == score_matrix[i - 1][j - 1] + (1 if seq1[i - 1] == seq2[j - 1] else -1):
+             aligned_seq1 += seq1[i - 1]
+             aligned_seq2 += seq2[j - 1]
+             i -= 1
+             j -= 1
+         elif i > 0 and current_score == score_matrix[i - 1][j] - 1:
+             aligned_seq1 += seq1[i - 1]
+             aligned_seq2 += "-"
+             i -= 1
+         else:
+             aligned_seq1 += "-"
+             aligned_seq2 += seq2[j - 1]
+             j -= 1
+ 
+     # Reverse the aligned sequences if needed
+     aligned_seq1 = aligned_seq1[::-1]
+     aligned_seq2 = aligned_seq2[::-1]
+ 
+     # Calculate matches from aligned sequences
+     matches = sum(c1 == c2 for c1, c2 in zip(aligned_seq1, aligned_seq2))
+ 
+     # Calculate the similarity percentage based on the maximum length
+     max_length = max(len(seq1), len(seq2))
+     return (matches / max_length) * 100
+ 
+ 
+ 
  def is_tool_installed(tool_name):
      """Check if a tool is installed and available in PATH."""
      # Check if the tool is in the system PATH
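The new `calculate_similarity` is a plain Needleman-Wunsch global alignment (match +1, mismatch -1, gap -1) with identity reported over the longer input; note that the `@profile` decorator means `utils.py` now imports only if `line_profiler_pycharm` is installed. A usage sketch with made-up sequences:

```python
from PyamilySeq.utils import calculate_similarity  # assumes line_profiler_pycharm is importable

seq_a = "ATGGCTAAGTGA"  # hypothetical 12 nt fragment
seq_b = "ATGGCAAGTGA"   # same fragment with one base deleted

# Unequal lengths take the O(len1 * len2) alignment path, so expect
# quadratic cost on long genes; equal lengths are compared position-wise.
print(round(calculate_similarity(seq_a, seq_b), 1))  # 91.7 -> 11 matches over 12 positions
```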
@@ -49,12 +111,23 @@ def reverse_complement(seq):
      complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
      return ''.join(complement[base] for base in reversed(seq))
  
+ 
  def fix_path(path):
      fixed_path = os.path.normpath(path)
      fixed_path = os.path.realpath(fixed_path)
      return fixed_path
  
  
+ def extract_identity(clustered_info):
+     # Use regular expressions to capture the percentage value at the end of the line
+     match = re.search(r'at ([-+]*)(\d+\.\d+)%', clustered_info)
+ 
+     if match:
+         percent_identity = float(match.group(2))  # Extract the percentage value
+         return percent_identity
+     else:
+         raise ValueError("Percent identity not found in the string.")
+ 
  def wrap_sequence(sequence, width=60):
      wrapped_sequence = []
      for i in range(0, len(sequence), width):
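`extract_identity` parses the per-member identity field from a CD-HIT `.clstr` line. A quick sketch with a hypothetical cluster line (protein-mode format, where the field reads `at 98.68%`; representative lines end in `*` and are handled separately in `read_cd_hit_output`):

```python
from PyamilySeq.utils import extract_identity

# Hypothetical .clstr member line; header and identity value are made up.
line = "1\t303aa, >Group_4|Escherichia_coli_110957|ENSB:abc123... at 98.68%"
print(extract_identity(line))  # 98.68
```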
@@ -111,14 +184,15 @@ def run_mafft_on_sequences(options, sequences, output_file):
      with open(output_file, 'w') as output_f:
          if options.verbose == True:
              subprocess.run(
-                 ['mafft', '--auto', temp_input_file_path],
+                 ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                  stdout=output_f,
                  stderr=sys.stderr,
                  check=True
              )
+ 
          else:
              subprocess.run(
-                 ['mafft', '--auto', temp_input_file_path],
+                 ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                  stdout=output_f,
                  stderr=subprocess.DEVNULL,  # Suppress stderr
                  check=True
@@ -265,30 +339,57 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
              combined_out_file.write(f">{genome_name}|{id}\n{wrapped_sequence}\n")
  
  
- def write_groups(options,output_dir, key_order, cores, sequences, pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
+ def write_groups(options, output_dir, key_order, cores, sequences,
+                  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
+     """
+     Writes individual FASTA files and a combined FASTA file for all sequences.
+ 
+     Parameters:
+     - options: Command-line options.
+     - output_dir: Directory where output FASTA files will be saved.
+     - key_order: The order in which to process keys.
+     - cores: Dictionary of core genes.
+     - sequences: Dictionary mapping headers to sequences.
+     - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
+     - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
+     """
      # Create output directory if it doesn't exist
      if not os.path.exists(output_dir):
          os.makedirs(output_dir)
-     for key_prefix in key_order:
-         for key, values in cores.items():
-             if any(part in options.write_groups.split(',') for part in key.split('_')):
-                 if key.startswith(key_prefix):
-                     for value in values:
-                         output_filename = f"{key}_{value}.fasta"
-                         if 'First' in key_prefix:
-                             sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
-                         else:  # combined_pangenome_clusters_Second_sequences is None if reclustered isn't being used
-                             sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
-                         # Write sequences to output file that are in the sequences dictionary
-                         with open(os.path.join(output_dir, output_filename), 'w') as outfile:
-                             for header in sequences_to_write:
-                                 if header in sequences:
-                                     outfile.write(f">{header}\n")
-                                     wrapped_sequence = wrap_sequence(sequences[header])
-                                     outfile.write(f"{wrapped_sequence}\n")
-                                 else:
-                                     if options.verbose == True:
-                                         print("Sequence " + header + " Not found in original_fasta file.")
+ 
+     combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences.fasta")
+ 
+     # Open combined FASTA file for writing all sequences
+     with open(combined_fasta_filename, 'w') as combined_fasta:
+         for key_prefix in key_order:
+             for key, values in cores.items():
+                 if any(part in options.write_groups.split(',') for part in key.split('_')):
+                     if key.startswith(key_prefix):
+                         for value in values:
+                             output_filename = f"{key}_{value}.fasta"
+                             if 'First' in key_prefix:
+                                 sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
+                             else:
+                                 sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
+ 
+                             # Write individual FASTA file
+                             with open(os.path.join(output_dir, output_filename), 'w') as outfile:
+                                 for header in sequences_to_write:
+                                     if header in sequences:
+                                         sequence = sequences[header]
+                                         outfile.write(f">{header}\n")
+                                         wrapped_sequence = wrap_sequence(sequence)
+                                         outfile.write(f"{wrapped_sequence}\n")
+ 
+                                         # Also write to the combined FASTA file
+                                         combined_fasta.write(f">Group_{value}|{header}\n")
+                                         combined_fasta.write(f"{wrapped_sequence}\n")
+                                     else:
+                                         if options.verbose:
+                                             print(f"Sequence {header} not found in original_fasta file.")
+ 
+     print(f"Combined FASTA file saved to: {combined_fasta_filename}")
+ 
  
  def process_gene_families(options, directory, output_file):
      """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
@@ -297,7 +398,7 @@ def process_gene_families(options, directory, output_file):
  
      # Iterate over each gene family file
      for gene_file in os.listdir(directory):
-         if gene_file.endswith('.fasta'):
+         if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta'):
              gene_path = os.path.join(directory, gene_file)
  
              # Read sequences from the gene family file
@@ -307,13 +408,15 @@ def process_gene_families(options, directory, output_file):
              longest_sequences = select_longest_gene(sequences)
  
              # Run mafft on the longest sequences
-             aligned_file = f"{gene_file}_aligned.fasta"
+             aligned_file = f"{directory}/{gene_file}_aligned.fasta.tmp"
              run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
  
              # Read aligned sequences and concatenate them
              aligned_sequences = read_fasta(aligned_file)
              for genome, aligned_seq in aligned_sequences.items():
                  genome_name = genome.split('|')[0]
+                 if 'Group' in genome_name:
+                     print(2)
                  if genome_name not in concatenated_sequences:
                      concatenated_sequences[genome_name] = ""
                  concatenated_sequences[genome_name] += aligned_seq
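The two `process_gene_families` tweaks serve the same purpose: intermediate alignments now carry a `.tmp` suffix and the combined FASTA is skipped, so neither gets picked up as a gene-family input on the `os.listdir` pass. A minimal sketch of the filter with hypothetical file names:

```python
# Hypothetical directory listing; only real per-family files survive the new filter.
files = ['Group_1.fasta', 'Group_1.fasta_aligned.fasta.tmp', 'combined_group_sequences.fasta']
kept = [f for f in files
        if f.endswith('.fasta') and not f.endswith('combined_group_sequences.fasta')]
print(kept)  # ['Group_1.fasta'] -- the .tmp alignment and the combined file are excluded
```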
{PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: PyamilySeq
- Version: 0.7.1
+ Version: 0.8.1
  Summary: PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
  Home-page: https://github.com/NickJD/PyamilySeq
  Author: Nicholas Dimonaco
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
  ```
  ### Example output:
  ```
- Running PyamilySeq v0.7.1
+ Running PyamilySeq v0.8.1
  Calculating Groups
  Gene Groups:
  First_core_99: 2682
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
  -cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
  ```
  ```commandline
- Running PyamilySeq v0.7.1
+ Running PyamilySeq v0.8.1
  Calculating Groups
  Genus Groups:
  First_genera_1: 28549
@@ -137,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
  ## PyamilySeq - Menu:
  ### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
  ```
- Running PyamilySeq v0.7.1
+ Running PyamilySeq v0.8.1
  usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
                       [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
                       [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
                       [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
                       [-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
  
- PyamilySeq v0.7.1: A tool that groups genes into unique clusters.
+ PyamilySeq v0.8.1: A tool that groups genes into unique clusters.
  
  options:
    -h, --help            show this help message and exit
@@ -176,8 +176,9 @@ Full-Mode Arguments - Required when "-run_mode Full" is used:
  Clustering Runtime Arguments - Optional when "-run_mode Full" is used:
    -mem CLUSTERING_MEMORY
                          Default 4000: Memory to be allocated for clustering (in MBs).
-   -t CLUSTERING_THREADS
-                         Default 4: Threads to be allocated for clustering.
+   -t THREADS            Default 8: Threads to be allocated for clustering
+                         and/or alignment.
+ 
  
  Partial-Mode Arguments - Required when "-run_mode Partial" is used:
    -cluster_file CLUSTER_FILE
@@ -221,7 +222,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
  ```
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
  
- Seq-Combiner v0.7.1: A tool to extract sequences from GFF/FASTA files.
+ Seq-Combiner v0.8.1: A tool to extract sequences from GFF/FASTA files.
  
  options:
    -h, --help            show this help message and exit
@@ -247,4 +248,40 @@ Misc Arguments:
  
  
  ```
+ 
+ ### Group-Splitter menu:
+ 
+ ```
+ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -output_dir OUTPUT_DIR [-pident PIDENT] [-len_diff LEN_DIFF] [-clustering_threads CLUSTERING_THREADS]
+                          [-clustering_memory CLUSTERING_MEMORY] [-percent_threshold PERCENT_THRESHOLD] [-verbose] [-delete_temp_files] [-v]
+ 
+ Group-Splitter: v0.8.1: A tool to split "paralogous" groups identified by PyamilySeq.
+ 
+ options:
+   -h, --help            show this help message and exit
+ 
+ Required Arguments:
+   -input_fasta INPUT_FASTA
+                         Input FASTA file containing gene groups.
+   -sequence_type {AA,DNA}
+                         Default - DNA: Are groups "DNA" or "AA" sequences?
+   -output_dir OUTPUT_DIR
+                         Output directory.
+ 
+ Optional Arguments:
+   -pident PIDENT        Sequence identity threshold (default: 0.9)
+   -len_diff LEN_DIFF    Length difference threshold (default: 0.05)
+   -clustering_threads CLUSTERING_THREADS
+                         Number of threads for clustering (default: 4)
+   -clustering_memory CLUSTERING_MEMORY
+                         Memory limit in MB for clustering (default: 2000)
+   -percent_threshold PERCENT_THRESHOLD
+                         Minimum percentage of genomes with paralogs (default: 80.0)
+   -verbose              Print verbose output.
+   -delete_temp_files    Delete all temporary files after processing.
+ 
+ Misc Arguments:
+   -v                    Print out version number and exit
+ ```
+ 
  ### All example input and output data can be found in the 'test_data' directory.
PyamilySeq-0.8.1.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ PyamilySeq/Constants.py,sha256=J_jZheqHCbmFVCLrY8nMe4T5VZQOQ7PbT_HmYSi58WM,31
+ PyamilySeq/Group_Splitter.py,sha256=wrz-vcQ2gJ40MLLczFY8te35_uYrOBuh2v-fJSIVsWo,15578
+ PyamilySeq/PyamilySeq.py,sha256=OAtz6b7dnvA-Qg0dnf2JXImiOtsDrDfVit7Q6DFbuPU,15265
+ PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
+ PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
+ PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
+ PyamilySeq/utils.py,sha256=vjPSIua4E72JTWlzH4CUaRcR-Z6Nr-RQ9N_92tfZI_w,19686
+ PyamilySeq-0.8.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ PyamilySeq-0.8.1.dist-info/METADATA,sha256=weIjFQkc7ggqkPlPkSA5an8eFiUzhDyxGl9t7-rJPsA,14555
+ PyamilySeq-0.8.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ PyamilySeq-0.8.1.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
+ PyamilySeq-0.8.1.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
+ PyamilySeq-0.8.1.dist-info/RECORD,,
{PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (72.2.0)
+ Generator: setuptools (75.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
{PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/entry_points.txt RENAMED
@@ -1,3 +1,4 @@
  [console_scripts]
+ Group-Splitter = PyamilySeq.Group_Splitter:main
  PyamilySeq = PyamilySeq.PyamilySeq:main
  Seq-Combiner = PyamilySeq.Seq_Combiner:main
@@ -1,14 +0,0 @@
1
- PyamilySeq/Constants.py,sha256=4MNcQLwJguoC9fHBLbreAe-GNgNvtzYrF0MBM6BFY_s,31
2
- PyamilySeq/PyamilySeq.py,sha256=RbM6G1yU64jlb9r7QRry1vw5mQsxndM6TrvMvq3BVik,15466
3
- PyamilySeq/PyamilySeq_Genus.py,sha256=ZjD61mTW7NgmsfGfFVEXeIZoSCha9PaLtMPnqdTtacU,12413
4
- PyamilySeq/PyamilySeq_Species.py,sha256=WL6pu8hlGpnemcpu1tLFmlUlPd4vJpQSW4Om5Hclu_k,14438
5
- PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
6
- PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
8
- PyamilySeq/utils.py,sha256=-0OZxmX96kOTzms8gnbFBvc5DL6NsqNHNpLpQ4UjNk8,15726
9
- PyamilySeq-0.7.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
10
- PyamilySeq-0.7.1.dist-info/METADATA,sha256=IpbThlfEmO-S8Nl617eQB64Xzu9GJDz19L4Jhx7lwGY,13076
11
- PyamilySeq-0.7.1.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
12
- PyamilySeq-0.7.1.dist-info/entry_points.txt,sha256=QtXD1tmnLvRAkIpGWZgXm1lfLH8GGeCwxmgoHZaTp98,102
13
- PyamilySeq-0.7.1.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
14
- PyamilySeq-0.7.1.dist-info/RECORD,,