PyamilySeq 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1 +1,15 @@
1
- PyamilySeq_Version = 'v0.2.0'
1
+ import subprocess
2
+
3
+ PyamilySeq_Version = 'v0.4.0'
4
+
5
+
6
+
7
def is_tool_installed(tool_name):
    """Check if a tool is installed and available in PATH.

    The tool is probed by running ``<tool_name> --version``.

    Args:
        tool_name: Name (or path) of the executable to probe.

    Returns:
        True when the process could be started and exited with status 0,
        False otherwise.
    """
    try:
        # The output itself is irrelevant; only the exit status matters.
        subprocess.run(
            [tool_name, '--version'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (not on PATH) as well as
        # PermissionError / NotADirectoryError (present but not executable),
        # all of which mean the tool cannot be used.
        return False
    return True
@@ -0,0 +1,186 @@
1
import argparse
import collections
import glob
import os
import subprocess

# Resolve the sibling modules relative to the package first so that this
# module can be imported as ``PyamilySeq.PyamilySeq`` (the installed console
# script entry point); fall back to flat imports when the file is executed
# directly as a script. The wildcard import used to sit outside this
# try/except as an unconditional flat import, which fails under the package
# layout before the fallback is ever reached.
try:
    from .PyamilySeq_Species import *
    from .PyamilySeq_Species import cluster
    from .Constants import *
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
    from PyamilySeq_Species import *
    from PyamilySeq_Species import cluster
    from Constants import *
15
+
16
def reverse_complement(seq):
    """Return the reverse complement of a nucleotide sequence.

    Upper- and lower-case bases and the IUPAC ambiguity codes are
    complemented (case is preserved). Any other symbol, such as an alignment
    gap, is passed through unchanged instead of raising ``KeyError``.

    Args:
        seq: Sequence as a string (or any iterable of single characters).

    Returns:
        The reverse-complemented sequence as a string.
    """
    complement = str.maketrans(
        'ACGTURYSWKMBDHVNacgturyswkmbdhvn',
        'TGCAAYRSWMKVHDBNtgcaayrswmkvhdbn',
    )
    if not isinstance(seq, str):
        seq = ''.join(seq)
    return seq.translate(complement)[::-1]
19
+
20
+
21
def read_separate_files(input_dir, name_split, combined_out):
    """Write the CDS sequences of every FASTA/GFF pair into one FASTA file.

    Every file in ``input_dir`` whose name ends with ``name_split`` is treated
    as a genome FASTA. Its annotation is expected next to it with the same
    base name and a ``.gff`` extension; FASTA files without such a companion
    are skipped.

    Args:
        input_dir: Directory that holds the FASTA and GFF files.
        name_split: Filename suffix used both to select the FASTA files and to
            derive the genome name (the part of the filename before it).
        combined_out: Path of the combined FASTA file to (over)write.
    """
    pattern = os.path.join(input_dir, '*' + name_split)
    with open(combined_out, 'w') as combined_out_file:
        # Sorted so the combined file is reproducible between runs.
        for fasta_file in sorted(glob.glob(pattern)):
            genome_name = os.path.basename(fasta_file).split(name_split)[0]
            # Swap only the extension; a plain str.replace would also rewrite
            # '.fasta' occurring in a directory name.
            corresponding_gff_file = os.path.splitext(fasta_file)[0] + '.gff'
            if not os.path.exists(corresponding_gff_file):
                continue
            cds_sequences = extract_cds_from_gff(fasta_file, corresponding_gff_file)
            for gene_name, seq in cds_sequences:
                # 'genome|gene' matches the headers written for combined
                # GFF input and the '|' split used when reading them back.
                combined_out_file.write(f">{genome_name}|{gene_name}\n{seq}\n")
33
+
34
def read_combined_files(input_dir, name_split, combined_out):
    """Extract CDS sequences from GFF files that embed their own FASTA.

    Every file in ``input_dir`` whose name ends with ``name_split`` is parsed:
    tab-separated 9-column rows of type ``CDS`` before the ``##FASTA`` marker
    are collected as features, and the records after the marker are used as
    the contig sequences. Each CDS is written to ``combined_out`` as
    ``>genome|ID`` followed by the sequence wrapped at 60 columns.

    Features on the '-' strand are reverse-complemented. CDS rows without an
    ``ID`` attribute, and features on contigs missing from the FASTA section,
    are skipped.

    Args:
        input_dir: Directory that holds the combined GFF files.
        name_split: Filename suffix used both to select the files and to
            derive the genome name (the part of the filename before it).
        combined_out: Path of the combined FASTA file to (over)write.
    """
    # Kept local so the function does not depend on other module helpers.
    complement = str.maketrans(
        'ACGTURYSWKMBDHVNacgturyswkmbdhvn',
        'TGCAAYRSWMKVHDBNtgcaayrswmkvhdbn',
    )
    pattern = os.path.join(input_dir, '*' + name_split)

    with open(combined_out, 'w') as combined_out_file:
        # Sorted so the combined file is reproducible between runs.
        for gff_file in sorted(glob.glob(pattern)):
            genome_name = os.path.basename(gff_file).split(name_split)[0]
            contig_chunks = {}
            gff_features = []
            current_contig = None
            fasta_section = False

            with open(gff_file, 'r') as handle:
                for raw_line in handle:
                    # Strip the line ending up front so it cannot leak into
                    # the last GFF column (and therefore into the ID).
                    line = raw_line.rstrip('\r\n')
                    if line.startswith('##FASTA'):
                        fasta_section = True
                        continue

                    if fasta_section:
                        if line.startswith('>'):
                            header_fields = line[1:].split()
                            current_contig = header_fields[0] if header_fields else None
                            if current_contig is not None:
                                contig_chunks[current_contig] = []
                        elif current_contig is not None:
                            contig_chunks[current_contig].append(line.strip())
                        continue

                    fields = line.split('\t')
                    if len(fields) != 9 or fields[2] != 'CDS':
                        continue
                    seq_id = None
                    for attribute in fields[8].split(';'):
                        attribute = attribute.strip()
                        if attribute.startswith('ID='):
                            seq_id = attribute[len('ID='):]
                            break
                    if seq_id is None:
                        continue
                    gff_features.append(
                        (fields[0], int(fields[3]), int(fields[4]), fields[6], seq_id))

            # Join every contig once instead of once per feature.
            contig_sequences = {
                contig: ''.join(chunks) for contig, chunks in contig_chunks.items()
            }
            for contig, start, end, strand, seq_id in gff_features:
                full_sequence = contig_sequences.get(contig)
                if full_sequence is None:
                    continue
                # GFF coordinates are 1-based and inclusive.
                cds_sequence = full_sequence[start - 1:end]
                if strand == '-':
                    cds_sequence = cds_sequence.translate(complement)[::-1]
                wrapped_sequence = '\n'.join(
                    cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60))
                combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
70
+
71
+
72
def run_cd_hit(input_file, clustering_output, options, threads=20):
    """Cluster the combined nucleotide FASTA with ``cd-hit-est``.

    Args:
        input_file: FASTA file to cluster (passed as ``-i``).
        clustering_output: Output prefix (passed as ``-o``).
        options: Object providing ``pident`` (passed as ``-c``) and
            ``len_diff`` (passed as ``-s``).
        threads: Value passed as ``-T``; defaults to the previously
            hard-coded 20.

    Raises:
        FileNotFoundError: If ``cd-hit-est`` is not on PATH.
        subprocess.CalledProcessError: If ``cd-hit-est`` exits with a non-zero
            status, so a failed run is reported here rather than later when
            its output is read.
    """
    cdhit_command = [
        'cd-hit-est',
        '-i', input_file,
        '-o', clustering_output,
        '-c', str(options.pident),
        '-s', str(options.len_diff),
        '-T', str(threads),
        '-d', "0",
        '-sc', "1",
        '-sf', "1"
    ]
    subprocess.run(cdhit_command, check=True)
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
def main():
    """Run the end-to-end pipeline: collect CDS, run CD-HIT, call cluster().

    Steps:
        1. Parse the command line.
        2. Write all CDS sequences found in the input directory to
           ``end_to_end_combined_sequences.fasta`` in the output directory.
        3. Cluster that file with ``cd-hit-est``.
        4. Hand the resulting ``.clstr`` file to ``cluster()``.
    """
    parser = argparse.ArgumentParser(
        description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
    required = parser.add_argument_group('Required Arguments')
    required.add_argument("-id", action="store", dest="input_dir",
                          help="Directory containing GFF/FASTA files.",
                          required=True)
    required.add_argument("-od", action="store", dest="output_dir",
                          help="Directory for all output files.",
                          required=True)
    required.add_argument("-it", action="store", dest="input_type", choices=['separate', 'combined'],
                          help="Type of input files: 'separate' for separate FASTA and GFF files,"
                               " 'combined' for GFF files with embedded FASTA sequences.",
                          required=True)
    required.add_argument("-ns", action="store", dest="name_split",
                          help="Character used to split the filename and extract the genome name.",
                          required=True)
    required.add_argument("-pid", action="store", dest="pident", type=float,
                          help="Pident threshold for CD-HIT clustering.",
                          required=True)
    required.add_argument("-ld", action="store", dest="len_diff", type=float,
                          help="Length difference (-s) threshold for CD-HIT clustering.",
                          required=True)
    required.add_argument("-co", action="store", dest="clustering_out",
                          help="Output file for initial clustering.",
                          required=True)
    required.add_argument("-ct", action="store", dest="clustering_type", choices=['CD-HIT', 'BLAST', 'DIAMOND', "MMseqs2"],
                          help="Clustering format for PyamilySeq.",
                          required=True)

    output_args = parser.add_argument_group('Output Parameters')
    output_args.add_argument('-w', action="store", dest='write_families', default=None,
                             help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
                                  ' - Must provide FASTA file with -fasta',
                             required=False)
    output_args.add_argument('-con', action="store", dest='con_core', default=None,
                             help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
                                  ' - Must provide FASTA file with -fasta',
                             required=False)
    output_args.add_argument('-fasta', action='store', dest='fasta',
                             help='FASTA file to use in conjunction with "-w" or "-con"',
                             required=False)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
                          required=False)
    optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
                          required=False)
    optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
                          help='Default - (\'99,95,15\'): Gene family groups to use')
    optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
                          required=False)

    parser.add_argument("pyamilyseq_args", nargs=argparse.REMAINDER, help="Additional arguments for PyamilySeq.")
    options = parser.parse_args()

    output_path = os.path.abspath(options.output_dir)
    # The combined FASTA is opened for writing straight away, so the output
    # directory has to exist first.
    os.makedirs(output_path, exist_ok=True)
    combined_out_file = os.path.join(output_path, "end_to_end_combined_sequences.fasta")
    clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_type)

    # Step 1: Read and rename sequences from files based on input type
    if options.input_type == 'separate':
        read_separate_files(options.input_dir, options.name_split, combined_out_file)
    else:
        read_combined_files(options.input_dir, options.name_split, combined_out_file)

    # Step 2: Run CD-HIT on the renamed sequences
    # NOTE(review): CD-HIT is run whatever -ct says; -ct currently only
    # changes the name of the clustering output prefix.
    run_cd_hit(combined_out_file, clustering_output, options)

    # Step 3: Run PyamilySeq with the CD-HIT output
    # NOTE(review): core_groups is fixed here and the -groups argument is not
    # forwarded; confirm whether that is intended before wiring it through.
    clustering_options = argparse.Namespace(
        format='CD-HIT',
        reclustered=options.reclustered,
        # Honour -st when given; 'StORF' is the documented default.
        sequence_tag=options.sequence_tag if options.sequence_tag is not None else 'StORF',
        core_groups='99,95,15,0',
        clusters=clustering_output + '.clstr',
        gene_presence_absence_out=options.gene_presence_absence_out,
        write_families=options.write_families,
        con_core=options.con_core,
        # cluster() reads options.fasta, so it must be present on this object.
        fasta=options.fasta,
    )
    cluster(clustering_options)
183
+
184
+
185
+ if __name__ == "__main__":
186
+ main()
@@ -6,6 +6,9 @@ import math
6
6
  import sys
7
7
  import argparse
8
8
  import os
9
+ from tempfile import NamedTemporaryFile
10
+
11
+
9
12
 
10
13
  try:
11
14
  from .Constants import *
@@ -20,6 +23,75 @@ def sort_keys_by_values(dict1, dict2):
20
23
  sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
21
24
  return sorted_keys
22
25
 
26
def select_longest_gene(sequences, delimiter='|'):
    """Select the longest sequence for each genome.

    The genome name is the part of the sequence ID before the first
    ``delimiter``. When several sequences of a genome share the maximum
    length, the first one encountered is kept.

    Args:
        sequences: Mapping of sequence ID to sequence.
        delimiter: Separator between genome name and gene ID in the sequence
            ID. Defaults to ``'|'``.

    Returns:
        Dict mapping genome name to a ``(sequence_id, sequence)`` tuple.
    """
    longest_sequences = {}
    for seq_id, sequence in sequences.items():
        genome = seq_id.split(delimiter, 1)[0]
        current = longest_sequences.get(genome)
        if current is None or len(sequence) > len(current[1]):
            longest_sequences[genome] = (seq_id, sequence)
    return longest_sequences
34
+
35
+
36
def run_mafft_on_sequences(sequences, output_file):
    """Run mafft on the given sequences and write to output file.

    The sequences are written to a temporary FASTA file, aligned with
    ``mafft --auto``, and mafft's standard output is stored in
    ``output_file``. mafft's standard error is discarded.

    Args:
        sequences: Mapping of FASTA header (without '>') to sequence.
        output_file: Path the alignment is written to.

    Raises:
        FileNotFoundError: If ``mafft`` is not on PATH.
        subprocess.CalledProcessError: If mafft exits with a non-zero status.
    """
    temp_input_file_path = None
    try:
        # Create a temporary input file for mafft. The path is recorded
        # before anything is written so the file is removed even when
        # writing fails part-way.
        with NamedTemporaryFile('w', delete=False) as temp_input_file:
            temp_input_file_path = temp_input_file.name
            for header, sequence in sequences.items():
                temp_input_file.write(f">{header}\n{sequence}\n")

        # Run mafft
        try:
            with open(output_file, 'w') as output_f:
                subprocess.run(
                    ['mafft', '--auto', temp_input_file_path],
                    stdout=output_f,
                    stderr=subprocess.DEVNULL,  # Suppress stderr
                    check=True
                )
        except (subprocess.CalledProcessError, OSError):
            # Do not leave an empty or truncated alignment behind that could
            # be mistaken for a valid result.
            if os.path.exists(output_file):
                os.remove(output_file)
            raise
    finally:
        if temp_input_file_path is not None and os.path.exists(temp_input_file_path):
            os.remove(temp_input_file_path)  # Clean up the temporary file
55
+
56
+
57
def process_gene_families(directory, output_file):
    """Align every gene family and write one concatenated alignment.

    For each ``.fasta`` file in ``directory`` the longest sequence per genome
    is selected and aligned with mafft. The aligned rows are appended to a
    per-genome concatenation. A genome that is missing from a family receives
    a run of '-' of that family's alignment length, so all concatenated rows
    keep the same length and stay in register.

    Args:
        directory: Directory holding one FASTA file per gene family.
        output_file: File name of the concatenated alignment; it is written
            next to ``directory`` (in its parent directory).
    """
    # Place the output beside the family directory. Building the path from
    # the parent directory avoids rewriting other parts of the path that
    # happen to contain the directory's name.
    parent_dir = os.path.dirname(os.path.normpath(directory))
    output_path = os.path.join(parent_dir, output_file)

    genome_chunks = {}   # genome name -> list of aligned pieces, in order
    total_columns = 0    # alignment columns concatenated so far

    # Iterate over each gene family file (sorted for a reproducible order)
    for gene_file in sorted(os.listdir(directory)):
        if not gene_file.endswith('.fasta'):
            continue
        gene_path = os.path.join(directory, gene_file)

        # Read sequences from the gene family file
        sequences = read_fasta(gene_path)

        # Select the longest sequence for each genome
        longest_sequences = select_longest_gene(sequences)

        # Run mafft on the longest sequences, using a scratch file in the
        # system temp directory rather than the current working directory.
        with NamedTemporaryFile('w', suffix='_aligned.fasta', delete=False) as scratch:
            aligned_file = scratch.name
        try:
            run_mafft_on_sequences(
                {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
            aligned_sequences = read_fasta(aligned_file)
        finally:
            # Clean up aligned file
            if os.path.exists(aligned_file):
                os.remove(aligned_file)

        if not aligned_sequences:
            continue
        family_columns = len(next(iter(aligned_sequences.values())))

        present = set()
        for header, aligned_seq in aligned_sequences.items():
            genome_name = header.split('|')[0]
            if genome_name not in genome_chunks:
                # First appearance: pad for the families already processed.
                genome_chunks[genome_name] = ['-' * total_columns] if total_columns else []
            genome_chunks[genome_name].append(aligned_seq)
            present.add(genome_name)

        # Pad the genomes that have no member in this family.
        for genome_name, chunks in genome_chunks.items():
            if genome_name not in present:
                chunks.append('-' * family_columns)
        total_columns += family_columns

    # Write the concatenated sequences to the output file
    with open(output_path, 'w') as out:
        for genome, chunks in genome_chunks.items():
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(''.join(chunks), 60)
            out.write(f"{wrapped_sequence}\n")
94
+
23
95
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
24
96
  print("Outputting gene_presence_absence file")
25
97
  in_name = options.clusters.split('.')[0]
@@ -92,7 +164,11 @@ def get_cores(options,genome_dict):
92
164
  for group in options.core_groups.split(','):
93
165
  calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
94
166
  if first == False:
95
- groups[group] = (calculated_floor,prev_top -1)
167
+ # Ensure no overlap
168
+ # if calculated_floor <= prev_top:
169
+ # calculated_floor = prev_top - 1
170
+
171
+ groups[group] = (calculated_floor,prev_top)
96
172
  else:
97
173
  groups[group] = (calculated_floor, prev_top)
98
174
  first = False
@@ -209,28 +285,28 @@ def combined_clustering_counting(options, pangenome_clusters_First, reps, combin
209
285
 
210
286
  #@profile
211
287
def single_clustering_counting(options, pangenome_clusters_First, reps):
    """Summarise first-round clusters when no second round is supplied.

    Every cluster is described by a six-element list
    ``[1, n, [n], 0, 0, 0]`` where ``n`` is ``len()`` of the value stored for
    that cluster in ``pangenome_clusters_First``.

    Args:
        options: Unused; kept for signature compatibility.
        pangenome_clusters_First: Mapping of cluster id to the collection
            stored for that cluster.
        reps: Mapping whose key order lines up with the cluster ids; each
            cluster id must be a valid index into its keys.

    Returns:
        Dict with the same keys (and order) as ``pangenome_clusters_First``
        mapping to the summary list described above.

    Raises:
        IndexError: If a cluster id has no matching entry in ``reps``.
    """
    list_of_reps = list(reps.keys())
    pangenome_clusters_Type = {}
    for cluster, First_genomes in pangenome_clusters_First.items():
        # Each cluster id is visited exactly once, so the per-cluster
        # bookkeeping list only ever held a single entry; the summary can be
        # built directly. The lookup is kept as a consistency check between
        # the cluster ids and the representatives.
        _ = list_of_reps[int(cluster)]
        cluster_size = len(First_genomes)
        pangenome_clusters_Type[cluster] = [1, cluster_size, [cluster_size], 0, 0, 0]

    return pangenome_clusters_Type
236
312
 
@@ -493,7 +569,7 @@ def cluster(options):
493
569
  pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
494
570
 
495
571
 
496
- counter = 0
572
+
497
573
  Number_Of_StORF_Extending_But_Same_Genomes = 0
498
574
 
499
575
  sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
@@ -504,12 +580,12 @@ def cluster(options):
504
580
  print("Calculating Groups")
505
581
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
506
582
  ############################### Calculate First only
507
- if numbers[0] == 1 and numbers[1] >=2:
508
- calc_First_only_core(cluster, numbers[1],groups,cores)
509
- counter +=1
510
- elif numbers[0] >1 and numbers[1] >=2:
511
- calc_First_only_core(cluster, numbers[2][0],groups,cores)
512
- counter += 1
583
+ #if numbers[0] == 1 and numbers[1] >=2:
584
+ calc_First_only_core(cluster, numbers[1],groups,cores)
585
+
586
+ # elif numbers[0] >1 and numbers[1] >=2:
587
+ # calc_First_only_core(cluster, numbers[2][0],groups,cores)
588
+
513
589
 
514
590
  if options.reclustered != None:
515
591
  ############################# Calculate First and Reclustered-Second
@@ -532,13 +608,13 @@ def cluster(options):
532
608
  if data[1] >= 2:
533
609
  calc_only_Second_only_core(groups, cores, data[1])
534
610
  ###########################
535
- print("End")
536
611
  key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
537
- print("Gene Family Groups:")
612
+ print("Gene Groups:")
538
613
  for key_prefix in key_order:
539
614
  for key, value in cores.items():
540
615
  if key.startswith(key_prefix):
541
616
  print(f"{key}: {len(value)}")
617
+ print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
542
618
 
543
619
  if options.gene_presence_absence_out != None:
544
620
  gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
@@ -566,25 +642,107 @@ def cluster(options):
566
642
  wrapped_sequence = wrap_sequence(sequences[header])
567
643
  outfile.write(f"{wrapped_sequence}\n")
568
644
 
645
+ if options.con_core != None and options.fasta != None and options.write_families != None:
646
+ process_gene_families(os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
647
+
648
+
649
+ # groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
650
+ # """Run mafft on all .fasta files in the given directory."""
651
+ # for filename in os.listdir(groups_dir):
652
+ # if filename.endswith('.fasta'):
653
+ # input_path = os.path.join(groups_dir, filename)
654
+ # output_filename = filename.replace('.fasta', '_mafft.aln')
655
+ # output_path = os.path.join(groups_dir, output_filename)
656
+ #
657
+ # # Call mafft command
658
+ # try:
659
+ # with open(output_path, 'w') as output_file:
660
+ # subprocess.run(
661
+ # ['mafft', '--auto', input_path],
662
+ # stdout=output_file,
663
+ # stderr=subprocess.DEVNULL, # Suppress stderr
664
+ # check=True
665
+ # )
666
+ # print(f"Processed {input_path} -> {output_path}")
667
+ # except subprocess.CalledProcessError as e:
668
+ # print(f"Failed to process {input_path}: {e}")
669
+
670
+ ##This could be run once and not above AND here..
671
+ # output_dir = os.path.dirname(os.path.abspath(options.clusters))
672
+ # sequences = read_fasta(options.fasta)
673
+ # concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
674
+ #
675
+ #
676
+ # for key_prefix in key_order:
677
+ # for key, values in cores.items():
678
+ # if any(part in options.con_core.split(',') for part in key.split('_')):
679
+ # if key.startswith(key_prefix):
680
+ # for value in values:
681
+ # length_capture = {genome: [] for genome in genome_dict.keys()}
682
+ # sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
683
+ # for header in sequences_to_write:
684
+ # if header in sequences:
685
+ # length_capture[header.split('|')[0]].append([header,len(sequences[header])])
686
+ # if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
687
+ # for genome, lengths in length_capture.items():
688
+ # max_value = float('-inf')
689
+ # max_item = None
690
+ # for length in lengths:
691
+ # current_value = length[1]
692
+ # if current_value > max_value:
693
+ # max_value = current_value
694
+ # max_item = length[0]
695
+ # concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
696
+ #
697
+ #
698
+ # with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
699
+ # for genome, sequence in concatenated_sequences.items():
700
+ # outfile.write(f">{genome}\n")
701
+ # wrapped_sequence = wrap_sequence(sequence)
702
+ # outfile.write(f"{wrapped_sequence}\n")
703
+
704
+
705
+ # for core_gene_family in core_gene_families:
706
+ # found_sequences = {genome: False for genome in genomes}
707
+ #
708
+ # for fasta_file in fasta_files:
709
+ # sequences = read_fasta(fasta_file)
710
+ # for header, sequence in sequences.items():
711
+ # genome = header.split('|')[0]
712
+ # if genome in genomes and core_gene_family in header:
713
+ # concatenated_sequences[genome] += sequence
714
+ # found_sequences[genome] = True
715
+ #
716
+ # for genome in genomes:
717
+ # if not found_sequences[genome]:
718
+ # concatenated_sequences[genome] += '-' * len(next(iter(sequences.values())))
719
+
720
+
569
721
 
570
722
 
571
723
  def main():
572
724
 
573
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
725
+ parser = argparse.ArgumentParser(description='PyamilySeq-Species ' + PyamilySeq_Version + ': PyamilySeq-Species Run Parameters.')
574
726
  parser._action_groups.pop()
575
727
 
576
728
  required = parser.add_argument_group('Required Arguments')
577
729
  required.add_argument('-c', action='store', dest='clusters', help='Clustering output file from CD-HIT, TSV or CSV Edge List',
578
730
  required=True)
579
731
  required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
580
- help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)
732
+ help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))',
733
+ required=True)
581
734
 
582
735
  output_args = parser.add_argument_group('Output Parameters')
583
- output_args.add_argument('-w', action="store", dest='write_families', default="99",
584
- help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95"'
585
- ' - Must provide FASTA file with -fasta')
736
+ output_args.add_argument('-w', action="store", dest='write_families', default=None,
737
+ help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
738
+ ' - Must provide FASTA file with -fasta',
739
+ required=False)
740
+ output_args.add_argument('-con', action="store", dest='con_core', default=None,
741
+ help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
742
+ ' - Must provide FASTA file with -fasta',
743
+ required=False)
586
744
  output_args.add_argument('-fasta', action='store', dest='fasta',
587
- help='FASTA file to use in conjunction with "-w"',
745
+ help='FASTA file to use in conjunction with "-w" or "-con"',
588
746
  required=False)
589
747
 
590
748
  optional = parser.add_argument_group('Optional Arguments')
@@ -592,16 +750,18 @@ def main():
592
750
  required=False)
593
751
  optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
594
752
  required=False)
595
- optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,90,80,15",
596
- help='Default - (\'99,95,90,80,15\'): Gene family groups to use')
753
+ optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
754
+ help='Default - (\'99,95,15\'): Gene family groups to use')
597
755
  optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
598
756
  required=False)
599
757
 
600
758
  misc = parser.add_argument_group('Misc')
601
759
  misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
602
- help='Default - False: Print out runtime messages')
760
+ help='Default - False: Print out runtime messages',
761
+ required = False)
603
762
  misc.add_argument('-v', action='store_true', dest='version',
604
- help='Default - False: Print out version number and exit')
763
+ help='Default - False: Print out version number and exit',
764
+ required=False)
605
765
 
606
766
 
607
767
  options = parser.parse_args()
@@ -614,6 +774,11 @@ def main():
614
774
  if options.sequence_tag == None:
615
775
  options.sequence_tag = 'StORF'
616
776
 
777
+ if options.con_core == True:
778
+ if is_tool_installed('mafft'):
779
+ print("mafft is installed. Proceeding with alignment.")
780
+ else:
781
+ print("mafft is not installed. Please install mafft to proceed.")
617
782
 
618
783
  if options.write_families != None and options.fasta == False:
619
784
  exit("-fasta must br provided if -w is used")
@@ -643,5 +808,5 @@ def main():
643
808
 
644
809
  if __name__ == "__main__":
645
810
  main()
646
- print("Complete")
811
+ print("Done")
647
812
 
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.1
2
+ Name: PyamilySeq
3
+ Version: 0.4.0
4
+ Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
+ Home-page: https://github.com/NickJD/PyamilySeq
6
+ Author: Nicholas Dimonaco
7
+ Author-email: nicholas@dimonaco.co.uk
8
+ Project-URL: Bug Tracker, https://github.com/NickJD/PyamilySeq/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.6
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+
16
+ # PyamilySeq - !BETA!
17
+ **PyamilySeq** (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, BLAST, DIAMOND or MMseqs2.
18
+ This work is an extension of the gene family / pangenome tool developed for the StORF-Reporter publication in NAR (https://doi.org/10.1093/nar/gkad814).
19
+
20
+ ## Features
21
+ - **End-to-End**: PyamilySeq can take a directory of GFF+FASTA files, run CD-HIT for clustering and process the results.
22
+ - **Clustering**: Supports input from CD-HIT formatted files as well as CSV and TSV edge lists (-outfmt 6 from BLAST/DIAMOND).
23
+ - **Reclustering**: Allows for the addition of new sequences post-initial clustering.
24
+ - **Output**: Generates a 'Roary/Panaroo'-formatted gene presence-absence CSV file for downstream analysis.
25
+ - Align representative sequences using MAFFT.
26
+ - Output concatenated aligned sequences for downstream analysis.
27
+ - Optionally output sequences of identified families.
28
+
29
+
30
+ ### Installation
31
+ PyamilySeq requires Python 3.6 or higher. Install using pip:
32
+
33
+ ```bash
34
+ pip install PyamilySeq
35
+ ```
36
+
37
+ ## Usage - Menu
38
+ ```
39
+ usage: PyamilySeq.py [-h] -id INPUT_DIR -od OUTPUT_DIR -it {separate,combined} -ns NAME_SPLIT -pid PIDENT -ld LEN_DIFF -co CLUSTERING_OUT -ct {CD-HIT,BLAST,DIAMOND,MMseqs2} [-w WRITE_FAMILIES] [-con CON_CORE] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG] [-groups CORE_GROUPS]
40
+ [-gpa GENE_PRESENCE_ABSENCE_OUT]
41
+ ...
42
+
43
+ PyamilySeq v0.4.0: PyamilySeq Run Parameters.
44
+
45
+ positional arguments:
46
+ pyamilyseq_args Additional arguments for PyamilySeq.
47
+
48
+ options:
49
+ -h, --help show this help message and exit
50
+
51
+ Required Arguments:
52
+ -id INPUT_DIR Directory containing GFF/FASTA files.
53
+ -od OUTPUT_DIR Directory for all output files.
54
+ -it {separate,combined}
55
+ Type of input files: 'separate' for separate FASTA and GFF files, 'combined' for GFF files with embedded FASTA sequences.
56
+ -ns NAME_SPLIT Character used to split the filename and extract the genome name.
57
+ -pid PIDENT Pident threshold for CD-HIT clustering.
58
+ -ld LEN_DIFF Length difference (-s) threshold for CD-HIT clustering.
59
+ -co CLUSTERING_OUT Output file for initial clustering.
60
+ -ct {CD-HIT,BLAST,DIAMOND,MMseqs2}
61
+ Clustering format for PyamilySeq.
62
+
63
+ Output Parameters:
64
+ -w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95" - Must provide FASTA file with -fasta
65
+ -con CON_CORE Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95" - Must provide FASTA file with -fasta
66
+ -fasta FASTA FASTA file to use in conjunction with "-w" or "-con"
67
+
68
+ Optional Arguments:
69
+ -rc RECLUSTERED Clustering output file from secondary round of clustering
70
+ -st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
71
+ -groups CORE_GROUPS Default - ('99,95,15'): Gene family groups to use
72
+ -gpa GENE_PRESENCE_ABSENCE_OUT
73
+ Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools
74
+ ```
75
+
76
+ ### Example Run End-to-End - 'genomes' is a test-directory containing GFF files with ##FASTA at the bottom
77
+
78
+ ```bash
79
+ PyamilySeq -id .../genomes -it combined -ns _combined.gff3 -pid 0.90 -ld 0.60 -co testing_cd-hit -ct CD-HIT -od .../testing
80
+ ```
81
+
82
+ ```
83
+ Calculating Groups
84
+ Gene Groups:
85
+ first_core_99: 3103
86
+ first_core_95: 0
87
+ first_core_15: 3217
88
+ first_core_0: 4808
89
+ Total Number of Gene Groups (Including Singletons): 11128
90
+ ```
91
+
92
+
@@ -0,0 +1,12 @@
1
+ PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
2
+ PyamilySeq/Constants.py,sha256=971sO5fjptv27yRtg595ex8VuNURb2Nh4mFSdGx6HJ4,399
3
+ PyamilySeq/PyamilySeq.py,sha256=Zy84pSBXY9EnMmk30SrfbQr9-SWYJ4rPHb9xbV3L9lU,8971
4
+ PyamilySeq/PyamilySeq_Species.py,sha256=kTXeCgplHfCglii_g099zdt2iy0lc5wDX3k4HuSaIgo,39167
5
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
7
+ PyamilySeq-0.4.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
8
+ PyamilySeq-0.4.0.dist-info/METADATA,sha256=d0goQEGZZz_q6_sZUwoPr-h7FR-Ad7WmupIJuK8MTFc,4462
9
+ PyamilySeq-0.4.0.dist-info/WHEEL,sha256=rWxmBtp7hEUqVLOnTaDOPpR-cZpCDkzhhcBce-Zyd5k,91
10
+ PyamilySeq-0.4.0.dist-info/entry_points.txt,sha256=aEpNchWXaSR7_hGQqXYGtvXz14FgIcfFdXESpEhsvXg,58
11
+ PyamilySeq-0.4.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
12
+ PyamilySeq-0.4.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.3.0)
2
+ Generator: setuptools (71.0.4)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ PyamilySeq = PyamilySeq.PyamilySeq:main
@@ -1,101 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: PyamilySeq
3
- Version: 0.2.0
4
- Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
- Home-page: https://github.com/NickJD/PyamilySeq
6
- Author: Nicholas Dimonaco
7
- Author-email: nicholas@dimonaco.co.uk
8
- Project-URL: Bug Tracker, https://github.com/NickJD/PyamilySeq/issues
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
11
- Classifier: Operating System :: OS Independent
12
- Requires-Python: >=3.6
13
- Description-Content-Type: text/markdown
14
- License-File: LICENSE
15
-
16
- # PyamilySeq
17
- PyamilySeq (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, DIAMOND or MMseqs2.
18
- This work is an extension of the gene family / pangenome tool developed for the StORF-Reporter publication in NAR (https://doi.org/10.1093/nar/gkad814).
19
-
20
- ## Features
21
-
22
- - **Clustering**: Supports input from CD-HIT formatted files as well as TSV and CSV Edge List formats.
23
- - **Reclustering**: Allows for the addition of new sequences post-initial clustering.
24
- - **Output**: Generates a gene 'Roary' presence-absence CSV formatted file for downstream analysis.
25
-
26
- ## Installation
27
-
28
- PyamilySeq requires Python 3.6 or higher. Install it using pip:
29
-
30
- ```bash
31
- pip install PyamilySeq
32
- ```
33
-
34
- ## Usage - Menu
35
- ```
36
- PyamilySeq_Species.py -h
37
- usage: PyamilySeq_Species.py [-h] -c CLUSTERS -f {CD-HIT,CSV,TSV} [-w WRITE_FAMILIES] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG]
38
- [-groups CORE_GROUPS] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
39
-
40
- PyamilySeq v0.2.0: PyamilySeq Run Parameters.
41
-
42
- Required Arguments:
43
- -c CLUSTERS Clustering output file from CD-HIT, TSV or CSV Edge List
44
- -f {CD-HIT,CSV,TSV} Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))
45
-
46
- Output Parameters:
47
- -w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95" - Must provide
48
- FASTA file with -fasta
49
- -fasta FASTA FASTA file to use in conjunction with "-w"
50
-
51
- Optional Arguments:
52
- -rc RECLUSTERED Clustering output file from secondary round of clustering
53
- -st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
54
- -groups CORE_GROUPS Default - ('99,95,90,80,15'): Gene family groups to use
55
- -gpa GENE_PRESENCE_ABSENCE_OUT
56
- Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other
57
- downstream tools
58
-
59
- Misc:
60
- -verbose {True,False}
61
- Default - False: Print out runtime messages
62
- -v Default - False: Print out version number and exit
63
-
64
- ```
65
-
66
- ### Clustering Analysis
67
-
68
- To perform clustering analysis:
69
-
70
- ```bash
71
- python pyamilyseq.py -c clusters_file -f format
72
- ```
73
-
74
- Replace `clusters_file` with the path to your clustering output file and `format` with one of: `CD-HIT`, `CSV`, or `TSV`.
75
-
76
- ### Reclustering
77
-
78
- To add new sequences and recluster:
79
-
80
- ```bash
81
- PyamilySeq -c clusters_file -f format --reclustered reclustered_file
82
- ```
83
-
84
- Replace `reclustered_file` with the path to the file containing additional sequences.
85
-
86
- ## Output
87
-
88
- PyamilySeq generates various outputs, including:
89
-
90
- - **Gene Presence-Absence File**: This CSV file details the presence and absence of genes across genomes.
91
- - **FASTA Files for Each Gene Family**:
92
-
93
- ## Gene Family Groups
94
-
95
- After analysis, PyamilySeq categorizes gene families into several groups:
96
-
97
- - **First Core**: Gene families present in all analysed genomes initially.
98
- - **Extended Core**: Gene families extended with additional sequences.
99
- - **Combined Core**: Gene families combined with both initial and additional sequences.
100
- - **Second Core**: Gene families identified only in the additional sequences.
101
- - **Only Second Core**: Gene families exclusively found in the additional sequences.
@@ -1,11 +0,0 @@
1
- PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
2
- PyamilySeq/Constants.py,sha256=3Nr6JfUVt2eZT4M7fV-sz_bPXIvPgxIBT5nR76kCPIo,30
3
- PyamilySeq/PyamilySeq_Species.py,sha256=SCWeK7bEfnKLrfzliiOx7Jtmie8vvAXGtQE_PpJD5hY,31040
4
- PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
6
- PyamilySeq-0.2.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
7
- PyamilySeq-0.2.0.dist-info/METADATA,sha256=FUiZzxQzqnOwokb7MflZCMUzK9JgFVUVzEvLBPAlpgk,4144
8
- PyamilySeq-0.2.0.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
9
- PyamilySeq-0.2.0.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
10
- PyamilySeq-0.2.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
11
- PyamilySeq-0.2.0.dist-info/RECORD,,
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- PyamilySeq = PyamilySeq.PyamilySeq_Species:main