PyamilySeq 0.5.2__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {pyamilyseq-0.5.2/src/PyamilySeq.egg-info → pyamilyseq-0.6.0}/PKG-INFO +13 -10
  2. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/README.md +12 -9
  3. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/setup.cfg +1 -1
  4. pyamilyseq-0.6.0/src/PyamilySeq/Constants.py +2 -0
  5. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq/PyamilySeq.py +33 -17
  6. pyamilyseq-0.6.0/src/PyamilySeq/PyamilySeq_Genus.py +259 -0
  7. pyamilyseq-0.6.0/src/PyamilySeq/PyamilySeq_Species.py +279 -0
  8. pyamilyseq-0.6.0/src/PyamilySeq/clusterings.py +324 -0
  9. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq/utils.py +84 -1
  10. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0/src/PyamilySeq.egg-info}/PKG-INFO +13 -10
  11. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq.egg-info/SOURCES.txt +1 -0
  12. pyamilyseq-0.5.2/src/PyamilySeq/Constants.py +0 -2
  13. pyamilyseq-0.5.2/src/PyamilySeq/PyamilySeq_Genus.py +0 -659
  14. pyamilyseq-0.5.2/src/PyamilySeq/PyamilySeq_Species.py +0 -730
  15. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/LICENSE +0 -0
  16. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/pyproject.toml +0 -0
  17. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py +0 -0
  18. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq/Seq_Combiner.py +0 -0
  19. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq/__init__.py +0 -0
  20. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq.egg-info/dependency_links.txt +0 -0
  21. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq.egg-info/entry_points.txt +0 -0
  22. {pyamilyseq-0.5.2 → pyamilyseq-0.6.0}/src/PyamilySeq.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PyamilySeq
3
- Version: 0.5.2
3
+ Version: 0.6.0
4
4
  Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
5
  Home-page: https://github.com/NickJD/PyamilySeq
6
6
  Author: Nicholas Dimonaco
@@ -57,10 +57,11 @@ Total Number of Gene Groups (Including Singletons): 11128
57
57
 
58
58
  ## Usage - Menu
59
59
  ```
60
- usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species} -clust_tool {CD-HIT} -output_dir OUTPUT_DIR [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
61
- [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-groups CORE_GROUPS] [-w WRITE_FAMILIES] [-con CON_CORE] [-original_fasta ORIGINAL_FASTA] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
60
+ usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clust_tool {CD-HIT} -output_dir OUTPUT_DIR [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT]
61
+ [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE] [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG]
62
+ [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_FAMILIES] [-con CON_CORE] [-original_fasta ORIGINAL_FASTA] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
62
63
 
63
- PyamilySeq v0.5.2: PyamilySeq Run Parameters.
64
+ PyamilySeq v0.6.0: PyamilySeq Run Parameters.
64
65
 
65
66
  options:
66
67
  -h, --help show this help message and exit
@@ -68,8 +69,8 @@ options:
68
69
  Required Arguments:
69
70
  -run_mode {Full,Partial}
70
71
  Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?
71
- -group_mode {Species}
72
- Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? - Genus mode not currently functioning
72
+ -group_mode {Species,Genus}
73
+ Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode?
73
74
  -clust_tool {CD-HIT} Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.
74
75
  -output_dir OUTPUT_DIR
75
76
  Directory for all output files.
@@ -95,10 +96,13 @@ Partial-Mode Arguments - Required when "-run_mode Partial" is used:
95
96
 
96
97
  Grouping Arguments - Use to fine-tune grouping of genes after clustering:
97
98
  -reclustered RECLUSTERED
98
- Clustering output file from secondary round of clustering
99
+ Currently only works on Partial Mode: Clustering output file from secondary round of clustering.
99
100
  -seq_tag SEQUENCE_TAG
100
101
  Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
101
- -groups CORE_GROUPS Default - ('99,95,15'): Gene family groups to use
102
+ -core_groups CORE_GROUPS
103
+ Default - ('99,95,15'): Gene family groups to use for "Species" mode
104
+ -genus_groups GENUS_GROUPS
105
+ Default - ('1,2,3,4,5,6'): Gene family groups to use for "Genus" mode
102
106
 
103
107
  Output Parameters:
104
108
  -w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95" - Must provide FASTA file with -fasta
@@ -113,7 +117,6 @@ Misc:
113
117
  Default - False: Print out runtime messages
114
118
  -v Default - False: Print out version number and exit
115
119
 
116
-
117
120
  ```
118
121
 
119
122
 
@@ -126,7 +129,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
126
129
  ```bash
127
130
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE
128
131
 
129
- Seq-Combiner v0.5.2: Seq-Combiner Run Parameters.
132
+ Seq-Combiner v0.6.0: Seq-Combiner Run Parameters.
130
133
 
131
134
  options:
132
135
  -h, --help show this help message and exit
@@ -42,10 +42,11 @@ Total Number of Gene Groups (Including Singletons): 11128
42
42
 
43
43
  ## Usage - Menu
44
44
  ```
45
- usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species} -clust_tool {CD-HIT} -output_dir OUTPUT_DIR [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
46
- [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-groups CORE_GROUPS] [-w WRITE_FAMILIES] [-con CON_CORE] [-original_fasta ORIGINAL_FASTA] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
45
+ usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clust_tool {CD-HIT} -output_dir OUTPUT_DIR [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT]
46
+ [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE] [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG]
47
+ [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_FAMILIES] [-con CON_CORE] [-original_fasta ORIGINAL_FASTA] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
47
48
 
48
- PyamilySeq v0.5.2: PyamilySeq Run Parameters.
49
+ PyamilySeq v0.6.0: PyamilySeq Run Parameters.
49
50
 
50
51
  options:
51
52
  -h, --help show this help message and exit
@@ -53,8 +54,8 @@ options:
53
54
  Required Arguments:
54
55
  -run_mode {Full,Partial}
55
56
  Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?
56
- -group_mode {Species}
57
- Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? - Genus mode not currently functioning
57
+ -group_mode {Species,Genus}
58
+ Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode?
58
59
  -clust_tool {CD-HIT} Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.
59
60
  -output_dir OUTPUT_DIR
60
61
  Directory for all output files.
@@ -80,10 +81,13 @@ Partial-Mode Arguments - Required when "-run_mode Partial" is used:
80
81
 
81
82
  Grouping Arguments - Use to fine-tune grouping of genes after clustering:
82
83
  -reclustered RECLUSTERED
83
- Clustering output file from secondary round of clustering
84
+ Currently only works on Partial Mode: Clustering output file from secondary round of clustering.
84
85
  -seq_tag SEQUENCE_TAG
85
86
  Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
86
- -groups CORE_GROUPS Default - ('99,95,15'): Gene family groups to use
87
+ -core_groups CORE_GROUPS
88
+ Default - ('99,95,15'): Gene family groups to use for "Species" mode
89
+ -genus_groups GENUS_GROUPS
90
+ Default - ('1,2,3,4,5,6'): Gene family groups to use for "Genus" mode
87
91
 
88
92
  Output Parameters:
89
93
  -w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95" - Must provide FASTA file with -fasta
@@ -98,7 +102,6 @@ Misc:
98
102
  Default - False: Print out runtime messages
99
103
  -v Default - False: Print out version number and exit
100
104
 
101
-
102
105
  ```
103
106
 
104
107
 
@@ -111,7 +114,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
111
114
  ```bash
112
115
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE
113
116
 
114
- Seq-Combiner v0.5.2: Seq-Combiner Run Parameters.
117
+ Seq-Combiner v0.6.0: Seq-Combiner Run Parameters.
115
118
 
116
119
  options:
117
120
  -h, --help show this help message and exit
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = PyamilySeq
3
- version = v0.5.2
3
+ version = v0.6.0
4
4
  author = Nicholas Dimonaco
5
5
  author_email = nicholas@dimonaco.co.uk
6
6
  description = PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
@@ -0,0 +1,2 @@
1
+ PyamilySeq_Version = 'v0.6.0'
2
+
@@ -7,11 +7,13 @@ import subprocess
7
7
 
8
8
 
9
9
  try:
10
- from .PyamilySeq_Species import cluster
10
+ from .PyamilySeq_Species import cluster as species_cluster
11
+ from .PyamilySeq_Genus import cluster as genus_cluster
11
12
  from .Constants import *
12
13
  from .utils import *
13
14
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
14
- from PyamilySeq_Species import cluster
15
+ from PyamilySeq_Species import cluster as species_cluster
16
+ from PyamilySeq_Genus import cluster as genus_cluster
15
17
  from Constants import *
16
18
  from utils import *
17
19
 
@@ -44,8 +46,8 @@ def main():
44
46
  required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
45
47
  help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
46
48
  required=True)
47
- required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species'],
48
- help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? - Genus mode not currently functioning',
49
+ required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species', 'Genus'],
50
+ help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode? ',
49
51
  required=True)
50
52
  required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
51
53
  help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
@@ -88,13 +90,17 @@ def main():
88
90
 
89
91
  ###Grouping Arguments
90
92
  grouping_args = parser.add_argument_group('Grouping Arguments - Use to fine-tune grouping of genes after clustering')
91
- grouping_args.add_argument('-reclustered', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
93
+ grouping_args.add_argument('-reclustered', action='store', dest='reclustered',
94
+ help='Currently only works on Partial Mode: Clustering output file from secondary round of clustering.',
92
95
  required=False)
93
96
  grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
94
97
  help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
95
98
  required=False)
96
- grouping_args.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
97
- help='Default - (\'99,95,15\'): Gene family groups to use',
99
+ grouping_args.add_argument('-core_groups', action="store", dest='core_groups', default="99,95,15",
100
+ help='Default - (\'99,95,15\'): Gene family groups to use for "Species" mode',
101
+ required=False)
102
+ grouping_args.add_argument('-genus_groups', action="store", dest='genus_groups', default="1,2,3,4,5,6",
103
+ help='Default - (\'1,2,3,4,5,6\'): Gene family groups to use for "Genus" mode',
98
104
  required=False)
99
105
 
100
106
  ###Output Arguments
@@ -126,6 +132,8 @@ def main():
126
132
 
127
133
  ### Checking all required parameters are provided by user
128
134
  if options.run_mode == 'Full':
135
+ if options.reclustered != None:
136
+ sys.exit("Currently reclustering only works on Partial Mode.")
129
137
  required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
130
138
  options.pident, options.len_diff]
131
139
  if all(required_full_mode):
@@ -165,7 +173,7 @@ def main():
165
173
  else:
166
174
  exit("mafft is not installed. Please install mafft to proceed.")
167
175
  ##CD-HIT
168
- if options.clust_tool == 'CD-HIT':
176
+ if options.clust_tool == 'CD-HIT' and options.run_mode == 'Full':
169
177
  if is_tool_installed('cd-hit'):
170
178
  if options.verbose == True:
171
179
  print("cd-hit is installed. Proceeding with clustering.")
@@ -175,7 +183,7 @@ def main():
175
183
  if options.write_families != None and options.original_fasta == False:
176
184
  exit("-fasta must br provided if -w is used")
177
185
 
178
- options.core_groups = options.core_groups + ',0'
186
+
179
187
 
180
188
 
181
189
  if options.cluster_file:
@@ -191,24 +199,30 @@ def main():
191
199
  combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
192
200
  clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
193
201
 
194
-
195
- if options.run_mode == 'Full':
196
-
202
+ if options.group_type == 'Species':
203
+ options.core_groups = options.core_groups + ',0'
204
+ groups_to_use = options.core_groups
205
+ else:
206
+ options.genus_groups = options.genus_groups + ',>'
207
+ groups_to_use = options.genus_groups
197
208
 
198
209
 
210
+ if options.run_mode == 'Full':
199
211
  if options.input_type == 'separate':
200
212
  read_separate_files(options.input_dir, options.name_split, combined_out_file)
201
213
  else:
202
214
  read_combined_files(options.input_dir, options.name_split, combined_out_file)
203
215
 
204
216
  run_cd_hit(combined_out_file, clustering_output, options)
217
+
205
218
  class clustering_options:
206
219
  def __init__(self):
207
220
  self.cluster_format = options.clust_tool
208
221
  self.reclustered = options.reclustered
209
222
  self.sequence_tag = options.sequence_tag
210
- self.core_groups = '99,95,15,0'
223
+ self.core_groups = groups_to_use
211
224
  self.clusters = clustering_output + clust_affix
225
+ self.output_dir = options.output_dir
212
226
  self.gene_presence_absence_out = options.gene_presence_absence_out
213
227
  self.write_families = options.write_families
214
228
  self.con_core = options.con_core
@@ -223,8 +237,9 @@ def main():
223
237
  self.cluster_format = options.clust_tool
224
238
  self.reclustered = options.reclustered
225
239
  self.sequence_tag = options.sequence_tag
226
- self.core_groups = '99,95,15,0'
240
+ self.core_groups = groups_to_use
227
241
  self.clusters = options.cluster_file
242
+ self.output_dir = options.output_dir
228
243
  self.gene_presence_absence_out = options.gene_presence_absence_out
229
244
  self.write_families = options.write_families
230
245
  self.con_core = options.con_core
@@ -234,9 +249,10 @@ def main():
234
249
  clustering_options = clustering_options()
235
250
 
236
251
 
237
-
238
-
239
- cluster(clustering_options)
252
+ if options.group_type == 'Species':
253
+ species_cluster(clustering_options)
254
+ elif options.group_type == 'Genus':
255
+ genus_cluster((clustering_options))
240
256
 
241
257
  print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
242
258
  "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
@@ -0,0 +1,259 @@
1
+ #from line_profiler_pycharm import profile
2
+
3
+ import copy
4
+ import sys
5
+ import math
6
+ from collections import Counter
7
+
8
+
9
+ try:
10
+ from .Constants import *
11
+ from .clusterings import *
12
+ from .utils import *
13
+ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
14
+ from Constants import *
15
+ from clusterings import *
16
+ from utils import *
17
+
18
+
19
+
20
def process_gene_families(options, directory, output_file):
    """Concatenate per-genome alignments built from every gene family FASTA in *directory*.

    Each ``.fasta`` file in *directory* is read, reduced with
    ``select_longest_gene``, aligned through ``run_mafft_on_sequences`` and the
    aligned sequences are appended to a running string per genome (the part of
    the FASTA header before the first ``|``).

    The result is written to *directory* with the substring
    'Gene_Families_Output' replaced by *output_file*, wrapped at 60 columns.
    """
    per_genome_alignment = {}
    output_file = directory.replace('Gene_Families_Output', output_file)

    for family_name in os.listdir(directory):
        # Only FASTA files are gene families; ignore anything else in the folder.
        if not family_name.endswith('.fasta'):
            continue

        family_sequences = read_fasta(os.path.join(directory, family_name))
        longest_per_genome = select_longest_gene(family_sequences)

        # The temporary alignment is written relative to the current working directory.
        alignment_path = f"{family_name}_aligned.fasta"
        run_mafft_on_sequences(options, dict(longest_per_genome.values()), alignment_path)

        for header, aligned in read_fasta(alignment_path).items():
            genome_name = header.split('|')[0]
            per_genome_alignment[genome_name] = per_genome_alignment.get(genome_name, "") + aligned

        # The alignment is only an intermediate product.
        os.remove(alignment_path)

    with open(output_file, 'w') as out:
        for genome, sequence in per_genome_alignment.items():
            out.write(f">{genome}\n")
            out.write(f"{wrap_sequence(sequence, 60)}\n")
57
+
58
def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
    """Write ``<cluster file stem>_gene_presence_absence.csv`` into ``options.output_dir``.

    Parameters:
        options: object providing ``output_dir`` and ``clusters`` (path of the
            clustering file; its base name up to the first '.' names the output).
        genus_dict: mapping whose keys become the per-genus columns.
        pangenome_clusters_First_sorted: cluster id -> collection whose length
            is reported as the isolate count.
        pangenome_clusters_First_sequences_sorted: cluster id -> list of
            sequence ids; an id belongs to a genus when the text before its
            first '_' equals the genus key.

    Fixes over the previous version: the file is closed deterministically, the
    header no longer contains a doubled quote before "Accessory Fragment", and
    the output name is derived from the file's base name so dots in directory
    names (e.g. ``./run/file.clstr``) no longer produce an empty name.
    """
    print("Outputting gene_presence_absence file")
    output_dir = os.path.abspath(options.output_dir)
    in_name = os.path.basename(options.clusters).split('.')[0]
    gpa_path = os.path.join(output_dir, in_name + '_gene_presence_absence.csv')

    # The trailing '","' opens the first genus column; it is closed after the join below.
    fixed_header = ('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
                    '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')

    with open(gpa_path, 'w') as gpa_outfile:
        gpa_outfile.write(fixed_header)
        gpa_outfile.write('","'.join(genus_dict.keys()))
        gpa_outfile.write('"\n')

        for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
            genomes_in_cluster = pangenome_clusters_First_sorted[cluster]
            average_sequences_per_genome = len(sequences) / len(genomes_in_cluster)
            # NOTE(review): the row leaves only one empty field before the isolate count
            # while the header has two columns there - confirm the intended alignment.
            gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(genomes_in_cluster))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
                              '","","","","","","","","",""')

            for genus in genus_dict.keys():
                members = [value for value in sequences if value.split('_')[0] == genus]
                # NOTE(review): multiple ids are joined without a separator, as before.
                gpa_outfile.write(',"' + ''.join(members) + '"')
            gpa_outfile.write('\n')
86
+
87
+ ### Below is some unfinished code
88
+ # edge_list_outfile = open(in_name+'_edge_list.csv','w')
89
+ # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
90
+ # output = []
91
+ # for entry in sequences:
92
+ # # Split each entry at '|'
93
+ # genome, gene = entry.split('|')
94
+ # # Format the result as "gene genome"
95
+ # output.append(f"{gene}\t{genome}")
96
+ # for line in output:
97
+ # edge_list_outfile.write(line + '\n')
98
+
99
+
100
+
101
+
102
def get_cores(options,genus_dict):
    """Create the empty group buckets that clusters are later sorted into.

    One 'First_genera_<group>' list is created for every comma-separated entry
    of ``options.core_groups``. When ``options.reclustered`` is set, the
    'extended', 'combined', 'Second' and 'only_Second' buckets are created for
    each group as well. ``genus_dict`` is accepted but not used here.

    Returns:
        (cores, groups): ``cores`` is the ordered mapping of bucket name to an
        empty list; ``groups`` is an empty ordered mapping.
    """
    groups = OrderedDict()
    cores = OrderedDict()

    bucket_prefixes = ['First_genera_']
    if options.reclustered is not None:
        bucket_prefixes += ['extended_genera_', 'combined_genera_',
                            'Second_genera_', 'only_Second_genera_']

    for group in options.core_groups.split(','):
        for prefix in bucket_prefixes:
            cores[prefix + group] = []
    return cores, groups
119
+
120
+
121
+ #@profile
122
def calc_First_only_core(cluster, First_number, cores):
    """Append *cluster* to the 'First_genera_<First_number>' bucket of *cores*.

    When no bucket exists for that exact number, the cluster goes into the
    overflow bucket 'First_genera_>' instead.
    """
    try:
        bucket = cores[f'First_genera_{First_number}']
    except KeyError:
        bucket = cores['First_genera_>']
    bucket.append(cluster)
127
+ #@profile
128
def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count gene families extended with StORFs
    """Append *cluster* to the 'extended_genera_<First_num + Second_num>' bucket.

    The bucket key is built from the numeric sum converted to text (the
    previous version concatenated the number directly onto the prefix, which
    raised TypeError). When no bucket exists for that sum, the cluster is
    appended to the overflow bucket 'extended_genera_>'.
    """
    group = First_num + Second_num
    try:
        cores['extended_genera_' + str(group)].append(cluster)
    except KeyError:
        cores['extended_genera_>'].append(cluster)
134
+ #@profile
135
def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
    """Append *cluster* to the 'combined_genera_<First_num + Second_num>' bucket.

    The numeric sum is converted to text before building the key (direct
    concatenation raised TypeError). When no bucket exists for that sum, the
    cluster is appended to the overflow bucket 'combined_genera_>'; the
    previous fallback appended the sum to that key and so could never match.
    """
    group = First_num + Second_num
    try:
        cores['combined_genera_' + str(group)].append(cluster)
    except KeyError:
        cores['combined_genera_>'].append(cluster)
141
+ #@profile
142
def calc_Second_only_core(cluster, cores, Second_num):
    """Append *cluster* to the 'Second_genera_<Second_num>' bucket of *cores*.

    Falls back to the overflow bucket 'Second_genera_>' when no bucket exists
    for that exact number.
    """
    try:
        bucket = cores[f'Second_genera_{Second_num}']
    except KeyError:
        bucket = cores['Second_genera_>']
    bucket.append(cluster)
147
+ #@profile
148
def calc_only_Second_only_core(cluster, cores, Second_num): # only count the true storf onlies
    """Append *cluster* to the 'only_Second_genera_<Second_num>' bucket of *cores*.

    Falls back to the overflow bucket 'only_Second_genera_>' when no bucket
    exists for that exact number. Only the missing-key case is handled; any
    other error propagates instead of being silently redirected to the
    overflow bucket (the previous bare ``except`` swallowed everything).
    """
    try:
        cores['only_Second_genera_' + str(Second_num)].append(cluster)
    except KeyError:
        cores['only_Second_genera_>'].append(cluster)
153
+
154
+
155
+
156
+ #@profile
157
def cluster(options):
    """Group clustered sequences by genus count and write the requested outputs.

    Reads the clustering named by ``options`` (CD-HIT, or TSV/CSV edge list),
    optionally merges a second round of clustering (``options.reclustered``),
    sorts every cluster into the buckets created by ``get_cores`` and then:

    * prints and writes ``summary_statistics.txt`` into ``options.output_dir``;
    * writes the gene presence/absence CSV when
      ``options.gene_presence_absence_out`` is set;
    * writes one FASTA per gene family when ``options.write_families`` and
      ``options.fasta`` are set;
    * builds the concatenated alignment when ``options.con_core`` is also set.

    Fixes over the previous version:
    * the TSV/CSV reclustering branch compared the format string to a list
      with ``==`` and could never run;
    * the extended/combined helpers were called with an extra ``groups``
      argument that their four-parameter definitions do not accept;
    * the Second-cluster tally now runs exactly once, after the main loop and
      only when reclustering was requested;
    * ``process_gene_families`` receives the gene family directory itself
      rather than a doubled '.../Gene_Families_Output/Gene_Families_Output'.
    """
    if options.cluster_format == 'CD-HIT':
        genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '_')
    elif options.cluster_format in ['TSV','CSV']:
        genus_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '_')

    reclustered = options.reclustered is not None

    if reclustered:
        if options.cluster_format == 'CD-HIT':
            combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genus_dict, '_')
        elif options.cluster_format in ['TSV','CSV']:
            combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '_')
        pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '_')
    else:
        pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)

    cores, groups = get_cores(options, genus_dict)

    # Tallied but not currently reported anywhere.
    Number_Of_StORF_Extending_But_Same_Genomes = 0

    sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
    pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_first_keys)
    pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_first_keys)
    pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

    print("Calculating Groups")
    for cluster_id, numbers in pangenome_clusters_Type_sorted.items():
        ############################### Calculate First only
        calc_First_only_core(cluster_id, numbers[1], cores)

        if reclustered:
            ############################# Calculate First and Reclustered-Second
            if numbers[0] == 1 and numbers[3] >= 1:  # If Seconds did not combine First reps
                calc_single_First_extended_Second_only_core(cluster_id, numbers[1], cores, numbers[3])
            elif numbers[0] > 1 and numbers[3] >= 1:  # If unique Seconds combined multiple Firsts
                calc_multi_First_extended_Second_only_core(cluster_id, numbers[1], cores, numbers[3])
            elif numbers[4] >= 1:
                Number_Of_StORF_Extending_But_Same_Genomes += 1

    if reclustered:
        # Second-round clusters are tallied once, split by whether they also
        # contain First-round sequences.
        for cluster_id, genomes in combined_pangenome_clusters_Second.items():
            genome_count = len(genomes)
            if genome_count < 1:
                continue
            if cluster_id in not_Second_only_cluster_ids:
                calc_Second_only_core(cluster_id, cores, genome_count)
            else:
                calc_only_Second_only_core(cluster_id, cores, genome_count)

    ###########################
    ### Output
    output_path = os.path.abspath(options.output_dir)
    stats_out = os.path.join(output_path,'summary_statistics.txt')
    total_line = "Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted))
    with open(stats_out,'w') as outfile:
        print("Genus Groups:")
        outfile.write("Genus Groups:\n")
        for key, members in cores.items():
            print(key+':\t'+str(len(members)))
            outfile.write(key + ':\t' + str(len(members))+'\n')
        print(total_line)
        outfile.write(total_line)

    if options.gene_presence_absence_out is not None:
        gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)

    if options.write_families is not None and options.fasta is not None:
        sequences = read_fasta(options.fasta)
        output_dir = os.path.join(output_path, 'Gene_Families_Output')

        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        requested_levels = options.write_families.split(',')
        # Each bucket is visited once; the earlier nested prefix loop only
        # rewrote the same files with the same content.
        for key, values in cores.items():
            if not any(part in requested_levels for part in key.split('_')):
                continue
            for value in values:
                output_filename = f"{key}_{value}.fasta"
                sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
                # Write sequences to output file that are in the sequences dictionary
                with open(os.path.join(output_dir, output_filename), 'w') as outfile:
                    for header in sequences_to_write:
                        if header in sequences:
                            outfile.write(f">{header}\n")
                            wrapped_sequence = wrap_sequence(sequences[header])
                            outfile.write(f"{wrapped_sequence}\n")

        if options.con_core is not None:
            # output_dir already is the gene family directory.
            process_gene_families(options, output_dir, 'concatonated_genes_aligned.fasta')
256
+
257
+
258
+
259
+