PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,10 @@ from collections import OrderedDict
3
3
  from collections import defaultdict
4
4
 
5
5
  try:
6
- from .Constants import *
6
+ from .constants import *
7
7
  from .utils import *
8
8
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
9
- from Constants import *
9
+ from constants import *
10
10
  from utils import *
11
11
 
12
12
 
@@ -1,15 +1,13 @@
1
- import collections
2
- import subprocess
3
- import os
1
+
4
2
  import argparse
5
3
  from collections import defaultdict, OrderedDict
6
- from line_profiler_pycharm import profile
4
+
7
5
 
8
6
  try:
9
- from .Constants import *
7
+ from .constants import *
10
8
  from .utils import *
11
9
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
12
- from Constants import *
10
+ from constants import *
13
11
  from utils import *
14
12
 
15
13
  def run_cd_hit(options, input_file, clustering_output, clustering_mode):
@@ -22,16 +20,16 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
22
20
  '-T', str(options.clustering_threads),
23
21
  '-M', str(options.clustering_memory),
24
22
  '-d', "0",
25
- '-g', "1",
23
+ '-g', str(options.fast_mode),
26
24
  '-sc', "1",
27
25
  '-sf', "1"
28
26
  ]
29
- if options.verbose:
27
+ if options.verbose == True:
30
28
  subprocess.run(cdhit_command)
31
29
  else:
32
30
  subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
33
31
 
34
- @profile
32
+ #@profile
35
33
  def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
36
34
  total_length = sum(entry['length'] for entry in cluster_data)
37
35
  avg_length = total_length / len(cluster_data)
@@ -54,8 +52,8 @@ def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
54
52
 
55
53
 
56
54
 
57
- def length_within_threshold(rep_length, length, len_diff):
58
- return abs(rep_length - length) / rep_length <= len_diff
55
+ #def length_within_threshold(rep_length, length, len_diff):
56
+ # return abs(rep_length - length) / rep_length <= len_diff
59
57
 
60
58
 
61
59
  def check_if_all_identical(clustered_sequences):
@@ -66,21 +64,40 @@ def check_if_all_identical(clustered_sequences):
66
64
 
67
65
 
68
66
 
69
- def read_fasta_groups(options):
67
+ def read_fasta_groups(options, groups_to_use):
70
68
  groups = defaultdict(list)
71
69
  genome_count = defaultdict(int)
72
70
  current_group = None
73
71
  current_sequence = []
74
72
 
75
- # Parse the list of specific group numbers if provided
76
- selected_groups = None
77
- if options.groups is not None:
78
- selected_groups = [int(g.strip()) for g in options.groups.split(',')]
73
+ if options.sequence_type == 'AA':
74
+ affix = '_aa.fasta'
75
+ else:
76
+ affix = '_dna.fasta'
77
+
78
+ combined_groups_fasta = options.input_directory + '/Gene_Groups_Output/combined_group_sequences' + affix
79
+
80
+ if groups_to_use[0] == 'ids':
81
+ selected_group_ids = [int(g.strip()) for g in groups_to_use[1].split(',')]
82
+ elif groups_to_use[0] == 'groups':
83
+ selected_groups = set(range(int(groups_to_use[1]), 101))
84
+ # Scan the directory for filenames that match the criteria
85
+ selected_group_ids = []
86
+ for filename in os.listdir(os.path.dirname(combined_groups_fasta)):
87
+ if 'core' in filename and filename.endswith('.fasta'):
88
+ try:
89
+ group_number = int(filename.split('_')[2])
90
+ if group_number in selected_groups:
91
+ selected_group_ids.append(int(filename.split('_')[3].split('.')[0]))
92
+ except ValueError:
93
+ continue
94
+
79
95
 
80
- with open(options.input_fasta, 'r') as f:
96
+ group_number = None
97
+ with open(combined_groups_fasta, 'r') as f:
81
98
  for line in f:
82
99
  if line.startswith('>'):
83
- if current_group is not None and (selected_groups is None or group_number in selected_groups):
100
+ if current_group is not None and (selected_group_ids is None or group_number in selected_group_ids):
84
101
  groups[current_group].append((current_group_header, ''.join(current_sequence)))
85
102
 
86
103
  current_group_header = line.strip()
@@ -91,7 +108,7 @@ def read_fasta_groups(options):
91
108
 
92
109
  # Only process if group matches the selected_groups or if no specific groups were provided
93
110
  group_number = int(current_group.replace('>Group_', '')) # Assuming format 'Group_n'
94
- if selected_groups is not None and group_number not in selected_groups:
111
+ if selected_group_ids is not None and group_number not in selected_group_ids:
95
112
  current_group = None # Skip this group
96
113
  continue
97
114
 
@@ -101,9 +118,12 @@ def read_fasta_groups(options):
101
118
  if current_group is not None:
102
119
  groups[current_group].append((current_group_header, ''.join(current_sequence)))
103
120
 
121
+
104
122
  return groups, genome_count
105
123
 
106
124
 
125
+
126
+
107
127
  def write_fasta(sequences, output_file):
108
128
  with open(output_file, 'w') as f:
109
129
  for header, seq in sequences:
@@ -145,14 +165,14 @@ def read_cd_hit_output(clustering_output):
145
165
 
146
166
  return clusters
147
167
 
148
- @profile
149
- def separate_groups(options, clustering_mode):
150
- groups, genome_count = read_fasta_groups(options)
168
+ #@profile
169
+ def separate_groups(options, clustering_mode, groups_to_use):
170
+ groups, genome_count = read_fasta_groups(options, groups_to_use)
151
171
 
152
- paralog_groups = defaultdict(int) # To track number of paralog groups
172
+ paralog_groups = defaultdict(lambda: {'count': 0, 'sizes': []}) # To track number of paralog groups and their sizes
153
173
 
154
174
  for group_header, sequences in groups.items():
155
- if options.verbose:
175
+ if options.verbose == True:
156
176
  print(f"\n###\nCurrent Group: {group_header.replace('>','')}\n")
157
177
 
158
178
  group_name = group_header.split('|')[0] # Get the group part (e.g., '>Group_n')
@@ -166,18 +186,18 @@ def separate_groups(options, clustering_mode):
166
186
  num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
167
187
 
168
188
  # Check if the group meets the threshold for having paralogs
169
- if options.groups == None:
170
- if (num_genomes_with_multiple_genes / options.genome_num) * 100 < options.group_threshold:
171
- continue
189
+ #if options.groups == None:
190
+ if (num_genomes_with_multiple_genes / options.genome_num) * 100 < options.group_threshold:
191
+ continue
172
192
 
173
193
 
174
194
  group_file_name = group_name.replace('>','')
175
195
 
176
- temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
196
+ temp_fasta = f"{options.gene_groups_output}/{group_file_name}.fasta"
177
197
  write_fasta(sequences, temp_fasta)
178
198
 
179
199
  # Run cd-hit on the individual group
180
- clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
200
+ clustering_output = f"{options.gene_groups_output}/{group_file_name}_clustering"
181
201
 
182
202
  run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
183
203
 
@@ -255,7 +275,7 @@ def separate_groups(options, clustering_mode):
255
275
 
256
276
  # Write each subgroup into a separate FASTA file
257
277
  if subgroup_sequences:
258
- subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
278
+ subgroup_file = f"{options.sub_groups_output}/{group_file_name}_subgroup_{subgroup_id}.fasta"
259
279
  write_fasta(subgroup_sequences, subgroup_file)
260
280
 
261
281
  # Remove processed sequences from the remaining list
@@ -264,7 +284,8 @@ def separate_groups(options, clustering_mode):
264
284
 
265
285
  # Increment subgroup ID for the next subgroup
266
286
  subgroup_id += 1
267
- paralog_groups[group_name] += 1 # Count this group as a paralog group
287
+ paralog_groups[group_name]['count'] += 1 # Count this group as a paralog group
288
+ paralog_groups[group_name]['sizes'].append(len(subgroup_sequences)) # Record the size of the subgroup
268
289
 
269
290
 
270
291
 
@@ -289,12 +310,13 @@ def separate_groups(options, clustering_mode):
289
310
 
290
311
  # Write out each subgroup to a separate FASTA file
291
312
  for subgroup_id, seqs in subgroup_sequences.items():
292
- subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
313
+ subgroup_file = f"{options.input_directory}/{group_file_name}_subgroup_{subgroup_id}.fasta"
293
314
  write_fasta(seqs, subgroup_file)
294
315
 
295
316
  # Increment subgroup ID globally for the next subgroup
296
317
  subgroup_id += 1
297
- paralog_groups[group_name] += 1 # Count this group as a paralog group
318
+ paralog_groups[group_name]['count'] += 1 # Count this group as a paralog group
319
+ paralog_groups[group_name]['sizes'].append(len(seqs)) # Record the size of the subgroup
298
320
 
299
321
 
300
322
 
@@ -307,53 +329,73 @@ def separate_groups(options, clustering_mode):
307
329
  if os.path.exists(clustering_output):
308
330
  os.remove(clustering_output)
309
331
 
310
- # Print metrics about paralog groups
311
- print(f"Identified {len(paralog_groups)} paralog groups:")
312
- for group_id, count in paralog_groups.items():
313
- print(f"Group ID: {group_id}, Number of new groups: {count}")
332
+
333
+ return paralog_groups
314
334
 
315
335
 
316
336
  def main():
317
337
  parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')
318
338
  ### Required Arguments
319
339
  required = parser.add_argument_group('Required Parameters')
320
- required.add_argument('-input_fasta', action='store', dest='input_fasta',
321
- help='Input FASTA file containing gene groups.',
340
+ required.add_argument('-input_directory', action='store', dest='input_directory',
341
+ help='Provide the directory of a PyamilySeq run.',
322
342
  required=True)
323
- required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
324
- help='Default - DNA: Are groups "DNA" or "AA" sequences?',
343
+ required.add_argument('-sequence_type', action='store', dest='sequence_type', default='AA',choices=['AA', 'DNA'],
344
+ help='Default - AA: Are groups "DNA" or "AA" sequences?',
325
345
  required=True)
326
346
  required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
327
347
  help='The total number of genomes must be provide',
328
348
  required=True)
329
- required.add_argument('-output_dir', action='store', dest='output_dir',
330
- help='Output directory.',
331
- required=True)
332
349
 
350
+
351
+ ### Regrouping Arguments
333
352
  regrouping_params = parser.add_argument_group('Regrouping Parameters')
334
- regrouping_params.add_argument('-groups', action="store", dest='groups', default=None,
335
- help='Default - auto: Detect groups to be split (see -group_threshold). '
336
- 'Provide "-groups 1,2,3,4" with group IDs to split specific groups.',
353
+ regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=None,
354
+ help='Default - 99: groups to be split by pangenome grouping (see -group_threshold). '
355
+ 'Provide "-groups 99" to split specific groups.',
356
+ required=False)
357
+ regrouping_params.add_argument('-group_ids', action="store", dest='group_ids', default=None,
358
+ help='Default - None: Provide "-group_ids 1,2,3,4" to split specific groups (see -group_threshold).',
337
359
  required=False)
338
360
  regrouping_params.add_argument('-group_threshold', action='store', dest='group_threshold', type=float, default=80,
339
- help='Minimum percentage of genomes with multi-copy (default: 80.0) - Does not work with "-groups"')
361
+ help='Default: 80: Minimum percentage of genomes with multi-copy in a gene group to be split.',
362
+ required=False)
363
+
364
+ ### Output Arguments
365
+ output_args = parser.add_argument_group('Output Parameters')
366
+ output_args.add_argument('-a', action="store", dest='align_core', default=None,
367
+ help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
368
+ 'provide group levels at which to output "-a 99,95".',
369
+ required=False)
370
+
340
371
 
372
+ ### CD-HIT Reclustering Arguments
341
373
  cdhit_params = parser.add_argument_group('CD-HIT Reclustering Parameters')
342
374
  cdhit_params.add_argument('-c', action='store', dest='pident', type=float, default=0.8,
343
375
  help='Sequence identity threshold (default: 0.8) - Probably should be higher than what was used in initial clustering.')
344
376
  cdhit_params.add_argument('-s', action='store', dest='len_diff', type=float, default=0.20,
345
377
  help="Length difference cutoff (default: 0.20) - Often the most impactful parameter to split 'multi-copy' gene groups.")
346
- cdhit_params.add_argument('-T', action='store', dest='clustering_threads', type=int, default=4,
347
- help='Number of threads for clustering (default: 4)')
378
+ cdhit_params.add_argument('-fastmode', action='store_true', dest='fast_mode',
379
+ help='Default False: Run CD-HIT with "-g 0" to speed up but reduce accuracy of clustering.',
380
+ required=False)
381
+ cdhit_params.add_argument('-T', action='store', dest='clustering_threads', type=int, default=8,
382
+ help='Number of threads for clustering (default: 8)')
348
383
  cdhit_params.add_argument('-M', action='store', dest='clustering_memory', type=int, default=2000,
349
384
  help='Memory limit in MB for clustering (default: 2000)')
350
385
 
386
+ ### MAFFT Alignment Arguments
387
+ alignment_args = parser.add_argument_group('Alignment Runtime Arguments - Optional when "-a" is provided.')
388
+
389
+ alignment_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
390
+ help="Default 8: Threads to be allocated for clustering and/or alignment.",
391
+ required=False)
351
392
 
393
+ ### Misc Arguments
352
394
  misc = parser.add_argument_group("Misc Parameters")
353
395
  misc.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
354
396
  help='Default: Delete all temporary files after processing.',
355
397
  required=False)
356
- misc.add_argument("-verbose", action="store_true", dest="verbose" ,
398
+ misc.add_argument("-verbose", action="store_true", dest="verbose",
357
399
  help="Print verbose output.",
358
400
  required=False)
359
401
  misc.add_argument("-v", "--version", action="version",
@@ -366,15 +408,162 @@ def main():
366
408
 
367
409
 
368
410
 
369
- if not os.path.exists(options.output_dir):
370
- os.makedirs(options.output_dir)
371
-
372
- if options.sequence_type == 'DNA':
373
- clustering_mode = 'cd-hit-est'
411
+ ###External tool checks:
412
+ ##MAFFT
413
+ if options.align_core == True:
414
+ if is_tool_installed('mafft'):
415
+ if options.verbose == True:
416
+ print("mafft is installed. Proceeding with alignment.")
417
+ else:
418
+ exit("mafft is not installed. Please install mafft to proceed.")
419
+ ##CD-HIT
420
+
421
+ if is_tool_installed('cd-hit'):
422
+ if options.verbose == True:
423
+ print("cd-hit is installed. Proceeding with clustering.")
424
+ if options.sequence_type == 'DNA':
425
+ clustering_mode = 'cd-hit-est'
426
+ else:
427
+ clustering_mode = 'cd-hit'
428
+ if options.fast_mode == True:
429
+ options.fast_mode = 0
430
+ if options.verbose == True:
431
+ print("Running CD-HIT in fast mode.")
432
+ else:
433
+ options.fast_mode = 1
434
+ if options.verbose == True:
435
+ print("Running CD-HIT in slow mode.")
374
436
  else:
375
- clustering_mode = 'cd-hit'
437
+ exit("cd-hit is not installed. Please install cd-hit to proceed.")
438
+
439
+ ##Alignment
440
+ if options.align_core != None:
441
+ if options.groups == None and options.group_ids == None:
442
+ sys.exit('Must provide "-groups" or "-group_ids" when requesting alignment with "-a".')
443
+
444
+ ##Output Directories
445
+ gene_groups_output = os.path.join(options.input_directory, "Gene_Groups_Output")
446
+ options.gene_groups_output = gene_groups_output
447
+ sub_groups_output = os.path.join(options.input_directory, "Sub_Groups_Output")
448
+ options.sub_groups_output = sub_groups_output
449
+ if not os.path.exists(gene_groups_output):
450
+ os.makedirs(gene_groups_output)
451
+ if not os.path.exists(sub_groups_output):
452
+ os.makedirs(sub_groups_output)
453
+
454
+ ## Get Summary Stats
455
+ summary_file = os.path.join(options.input_directory, 'summary_statistics.txt')
456
+
457
+ # Save arguments to a text file
458
+ params_out = os.path.join(options.input_directory, 'Group-Splitter_params.txt')
459
+ with open(params_out, "w") as outfile:
460
+ for arg, value in vars(options).items():
461
+ outfile.write(f"{arg}: {value}\n")
462
+
463
+
464
+
465
+ ## Group Selection - FIX THIS - currently fails if either are not provided
466
+ if options.groups != None and options.group_ids != None:
467
+ sys.exit('Must provide "-group_ids" or "-groups", not both.')
468
+ elif options.group_ids != None:
469
+ groups_to_use = ('ids', options.group_ids)
470
+ elif options.groups != None:
471
+ groups_to_use = ('groups', options.groups)
472
+ else:
473
+ groups_to_use = ('groups', 99)
474
+
475
+
476
+
477
+ paralog_groups = separate_groups(options, clustering_mode, groups_to_use)
478
+ ###
479
+ # Print metrics about paralog groups
480
+ print(f"Identified {len(paralog_groups)} paralog groups:")
481
+ for group_id, data in paralog_groups.items():
482
+ print(f"Group ID: {group_id}, Number of new groups: {data['count']}, Sizes: {data['sizes']}")
483
+ ###
484
+
485
+
486
+ # Read summary statistics
487
+ with open(summary_file, 'r') as f:
488
+ summary_data = f.read().splitlines()
489
+
490
+ summary_info = {}
491
+ for line in summary_data:
492
+ if ':' in line:
493
+ key, value = line.split(':')
494
+ summary_info[key.strip()] = int(value.strip())
495
+
496
+ genome_num = summary_info['Number of Genomes']
497
+ core_99 = summary_info['First_core_99']
498
+ core_95 = summary_info['First_core_95']
499
+ core_15 = summary_info['First_core_15']
500
+ core_0 = summary_info['First_core_0']
501
+ total_gene_groups = summary_info['Total Number of First Gene Groups (Including Singletons)']
502
+
503
+ # Initialise new core values
504
+ new_core_99 = core_99
505
+ new_core_95 = core_95
506
+ new_core_15 = core_15
507
+ new_core_0 = core_0
508
+
509
+ # Recalculate each *_core_* value
510
+ for group_id, data in paralog_groups.items():
511
+ group_id = group_id.replace('>Group_', '')
512
+ original_group = next((f for f in os.listdir(gene_groups_output) if f.endswith(f'_{group_id}.fasta')), None)
513
+ original_group = int(original_group.split('_')[2])
514
+ if original_group == 99:
515
+ new_core_99 -= 1
516
+ elif original_group == 95:
517
+ new_core_95 -= 1
518
+ elif original_group == 15:
519
+ new_core_15 -= 1
520
+ elif original_group == 0:
521
+ new_core_0 -= 1
522
+
523
+ for size in data['sizes']:
524
+ if size >= math.floor(99 * genome_num / 100):
525
+ new_core_99 += 1
526
+ elif size >= math.floor(95 * genome_num / 100):
527
+ new_core_95 += 1
528
+ elif size >= math.floor(15 * genome_num / 100):
529
+ new_core_15 += 1
530
+ elif size >= math.floor(0 * genome_num / 100):
531
+ new_core_0 += 1
532
+
533
+
534
+
535
+
536
+ # Write out the new summary statistics - currently only works for default cores
537
+ stats_out = summary_file.replace('.txt','_recalculated.txt')
538
+ key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
539
+ with open(stats_out, 'w') as outfile:
540
+ print("Number of Genomes: " + str(options.genome_num))
541
+ outfile.write("Number of Genomes: " + str(options.genome_num) + "\n")
542
+ print("Reclaculated Gene Groups:")
543
+ outfile.write("Recalculated Gene Groups\n")
544
+ print(f"First_core_99: {new_core_99}")
545
+ outfile.write(f"First_core_99: {new_core_99}\n")
546
+ print(f"First_core_95: {new_core_95}")
547
+ outfile.write(f"First_core_95: {new_core_95}\n")
548
+ print(f"First_core_15: {new_core_15}")
549
+ outfile.write(f"First_core_15: {new_core_15}\n")
550
+ print(f"First_core_0: {new_core_0}")
551
+ outfile.write(f"First_core_0: {new_core_0}\n")
552
+ print("Total Number of First Gene Groups (Including Singletons): " + str(total_gene_groups))
553
+ outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(total_gene_groups))
554
+
555
+ # Alignment
556
+ if options.align_core != None:
557
+ print("\n\nProcessing gene group alignment")
558
+ group_directory = options.gene_groups_output
559
+ sub_group_directory = options.sub_groups_output
560
+ genome_list = read_genomes_from_fasta(options.gene_groups_output + '/combined_group_sequences_dna.fasta')
561
+ process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, 'concatenated_genes_post_splitting_aligned_dna.fasta')
562
+
563
+
564
+
565
+
376
566
 
377
- separate_groups(options, clustering_mode)
378
567
 
379
568
 
380
569
  if __name__ == "__main__":