PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Summary.py +2 -2
- PyamilySeq/Group_Splitter.py +247 -58
- PyamilySeq/PyamilySeq.py +168 -148
- PyamilySeq/PyamilySeq_Genus.py +11 -11
- PyamilySeq/PyamilySeq_Species.py +51 -29
- PyamilySeq/Seq_Combiner.py +6 -7
- PyamilySeq/Seq_Extractor.py +64 -0
- PyamilySeq/Seq_Finder.py +56 -0
- PyamilySeq/clusterings.py +139 -49
- PyamilySeq/constants.py +2 -0
- PyamilySeq/utils.py +214 -56
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/METADATA +174 -138
- PyamilySeq-1.0.1.dist-info/RECORD +18 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/entry_points.txt +2 -0
- PyamilySeq/Constants.py +0 -2
- PyamilySeq-0.9.0.dist-info/RECORD +0 -16
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/top_level.txt +0 -0
PyamilySeq/Cluster_Summary.py
CHANGED
@@ -3,10 +3,10 @@ from collections import OrderedDict
 from collections import defaultdict
 
 try:
-    from .
+    from .constants import *
     from .utils import *
 except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
-    from
+    from constants import *
     from utils import *
 
 
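Note (not part of the published diff): the change above points both import branches at the renamed lowercase constants module while keeping the existing try/except fallback. A minimal sketch of that dual-import pattern, assuming only the module names visible in the diff (constants, utils):

# Minimal sketch of the package/script import fallback used by these modules.
# The relative form works when the file is imported as part of the installed
# PyamilySeq package; the absolute form is the fallback when the file is
# executed directly as a standalone script.
try:
    from .constants import *
    from .utils import *
except (ModuleNotFoundError, ImportError, NameError, TypeError):
    from constants import *
    from utils import *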
PyamilySeq/Group_Splitter.py
CHANGED
@@ -1,15 +1,13 @@
-
-import subprocess
-import os
+
 import argparse
 from collections import defaultdict, OrderedDict
-
+
 
 try:
-    from .
+    from .constants import *
     from .utils import *
 except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
-    from
+    from constants import *
     from utils import *
 
 def run_cd_hit(options, input_file, clustering_output, clustering_mode):
@@ -22,16 +20,16 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
         '-T', str(options.clustering_threads),
         '-M', str(options.clustering_memory),
         '-d', "0",
-        '-g',
+        '-g', str(options.fast_mode),
         '-sc', "1",
         '-sf', "1"
     ]
-    if options.verbose:
+    if options.verbose == True:
         subprocess.run(cdhit_command)
     else:
         subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
-@profile
+#'@profile
def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
     total_length = sum(entry['length'] for entry in cluster_data)
     avg_length = total_length / len(cluster_data)
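Note (not part of the published diff): the hunk above makes the CD-HIT '-g' accuracy flag follow the new fast_mode option. A minimal sketch of how such a call is assembled and run, assuming the option names visible elsewhere in this file (pident, len_diff, clustering_threads, clustering_memory, fast_mode, verbose); the '-i'/'-o'/'-c'/'-s' arguments are an assumption, since the start of the command list is not shown in this hunk:

import subprocess

def run_cd_hit_sketch(options, input_file, clustering_output, clustering_mode='cd-hit'):
    # Build the CD-HIT command line; 'cd-hit-est' would be used for DNA input.
    cdhit_command = [
        clustering_mode,
        '-i', input_file,                       # assumed input flag
        '-o', clustering_output,                # assumed output flag
        '-c', str(options.pident),              # identity threshold (argparse '-c')
        '-s', str(options.len_diff),            # length difference cutoff (argparse '-s')
        '-T', str(options.clustering_threads),
        '-M', str(options.clustering_memory),
        '-d', "0",
        '-g', str(options.fast_mode),           # 0 = fast, 1 = accurate
        '-sc', "1",
        '-sf', "1",
    ]
    if options.verbose:
        subprocess.run(cdhit_command)
    else:
        # Suppress CD-HIT's stdout/stderr when not running verbosely.
        subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)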
@@ -54,8 +52,8 @@ def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
 
 
 
-def length_within_threshold(rep_length, length, len_diff):
-    return abs(rep_length - length) / rep_length <= len_diff
+#def length_within_threshold(rep_length, length, len_diff):
+#    return abs(rep_length - length) / rep_length <= len_diff
 
 
 def check_if_all_identical(clustered_sequences):
@@ -66,21 +64,40 @@ def check_if_all_identical(clustered_sequences):
 
 
 
-def read_fasta_groups(options):
+def read_fasta_groups(options, groups_to_use):
     groups = defaultdict(list)
     genome_count = defaultdict(int)
     current_group = None
     current_sequence = []
 
-
-
-
-
+    if options.sequence_type == 'AA':
+        affix = '_aa.fasta'
+    else:
+        affix = '_dna.fasta'
+
+    combined_groups_fasta = options.input_directory + '/Gene_Groups_Output/combined_group_sequences' + affix
+
+    if groups_to_use[0] == 'ids':
+        selected_group_ids = [int(g.strip()) for g in groups_to_use[1].split(',')]
+    elif groups_to_use[0] == 'groups':
+        selected_groups = set(range(int(groups_to_use[1]), 101))
+        # Scan the directory for filenames that match the criteria
+        selected_group_ids = []
+        for filename in os.listdir(os.path.dirname(combined_groups_fasta)):
+            if 'core' in filename and filename.endswith('.fasta'):
+                try:
+                    group_number = int(filename.split('_')[2])
+                    if group_number in selected_groups:
+                        selected_group_ids.append(int(filename.split('_')[3].split('.')[0]))
+                except ValueError:
+                    continue
+
 
-
+    group_number = None
+    with open(combined_groups_fasta, 'r') as f:
         for line in f:
             if line.startswith('>'):
-                if current_group is not None and (
+                if current_group is not None and (selected_group_ids is None or group_number in selected_group_ids):
                     groups[current_group].append((current_group_header, ''.join(current_sequence)))
 
                 current_group_header = line.strip()
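Note (not part of the published diff): in the new group-selection branch above, both the core level and the group ID are parsed out of file names found next to the combined group FASTA. A small worked example of that parsing, using a hypothetical file name since the exact naming scheme is not shown in this diff:

# Hypothetical file name for illustration only; the split indices match the diff.
filename = 'First_core_99_123.fasta'
parts = filename.split('_')                 # -> ['First', 'core', '99', '123.fasta']
core_level = int(parts[2])                  # -> 99  (pangenome grouping level)
group_id = int(parts[3].split('.')[0])      # -> 123 (original group ID)
print(core_level, group_id)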
@@ -91,7 +108,7 @@ def read_fasta_groups(options):
 
                 # Only process if group matches the selected_groups or if no specific groups were provided
                 group_number = int(current_group.replace('>Group_', '')) # Assuming format 'Group_n'
-                if
+                if selected_group_ids is not None and group_number not in selected_group_ids:
                     current_group = None # Skip this group
                     continue
 
@@ -101,9 +118,12 @@ def read_fasta_groups(options):
     if current_group is not None:
         groups[current_group].append((current_group_header, ''.join(current_sequence)))
 
+
     return groups, genome_count
 
 
+
+
 def write_fasta(sequences, output_file):
     with open(output_file, 'w') as f:
         for header, seq in sequences:
@@ -145,14 +165,14 @@ def read_cd_hit_output(clustering_output):
 
     return clusters
 
-
-def separate_groups(options, clustering_mode):
-    groups, genome_count = read_fasta_groups(options)
+#@profile
+def separate_groups(options, clustering_mode, groups_to_use):
+    groups, genome_count = read_fasta_groups(options, groups_to_use)
 
-    paralog_groups = defaultdict(
+    paralog_groups = defaultdict(lambda: {'count': 0, 'sizes': []}) # To track number of paralog groups and their sizes
 
     for group_header, sequences in groups.items():
-        if options.verbose:
+        if options.verbose == True:
             print(f"\n###\nCurrent Group: {group_header.replace('>','')}\n")
 
         group_name = group_header.split('|')[0] # Get the group part (e.g., '>Group_n')
@@ -166,18 +186,18 @@ def separate_groups(options, clustering_mode):
         num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
 
         # Check if the group meets the threshold for having paralogs
-        if options.groups == None:
-
-
+        #if options.groups == None:
+        if (num_genomes_with_multiple_genes / options.genome_num) * 100 < options.group_threshold:
+            continue
 
 
         group_file_name = group_name.replace('>','')
 
-        temp_fasta = f"{options.
+        temp_fasta = f"{options.gene_groups_output}/{group_file_name}.fasta"
         write_fasta(sequences, temp_fasta)
 
         # Run cd-hit on the individual group
-        clustering_output = f"{options.
+        clustering_output = f"{options.gene_groups_output}/{group_file_name}_clustering"
 
         run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
 
@@ -255,7 +275,7 @@ def separate_groups(options, clustering_mode):
 
             # Write each subgroup into a separate FASTA file
             if subgroup_sequences:
-                subgroup_file = f"{options.
+                subgroup_file = f"{options.sub_groups_output}/{group_file_name}_subgroup_{subgroup_id}.fasta"
                 write_fasta(subgroup_sequences, subgroup_file)
 
             # Remove processed sequences from the remaining list
@@ -264,7 +284,8 @@ def separate_groups(options, clustering_mode):
 
             # Increment subgroup ID for the next subgroup
             subgroup_id += 1
-            paralog_groups[group_name] += 1 # Count this group as a paralog group
+            paralog_groups[group_name]['count'] += 1 # Count this group as a paralog group
+            paralog_groups[group_name]['sizes'].append(len(subgroup_sequences)) # Record the size of the subgroup
 
 
 
@@ -289,12 +310,13 @@ def separate_groups(options, clustering_mode):
 
             # Write out each subgroup to a separate FASTA file
             for subgroup_id, seqs in subgroup_sequences.items():
-                subgroup_file = f"{options.
+                subgroup_file = f"{options.input_directory}/{group_file_name}_subgroup_{subgroup_id}.fasta"
                 write_fasta(seqs, subgroup_file)
 
                 # Increment subgroup ID globally for the next subgroup
                 subgroup_id += 1
-                paralog_groups[group_name] += 1 # Count this group as a paralog group
+                paralog_groups[group_name]['count'] += 1 # Count this group as a paralog group
+                paralog_groups[group_name]['sizes'].append(len(seqs)) # Record the size of the subgroup
 
 
 
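Note (not part of the published diff): the two hunks above replace the old bare counter with a per-group record holding both the number of subgroups and their sizes. A minimal sketch of that bookkeeping and of how it is later reported, using hypothetical group names and sizes for illustration only:

from collections import defaultdict

# Each original group maps to {'count': <number of subgroups>, 'sizes': [<subgroup sizes>]}.
paralog_groups = defaultdict(lambda: {'count': 0, 'sizes': []})

# Hypothetical subgroup sizes for one split group.
for subgroup_size in (12, 7, 3):
    paralog_groups['>Group_42']['count'] += 1
    paralog_groups['>Group_42']['sizes'].append(subgroup_size)

for group_id, data in paralog_groups.items():
    print(f"Group ID: {group_id}, Number of new groups: {data['count']}, Sizes: {data['sizes']}")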
@@ -307,53 +329,73 @@ def separate_groups(options, clustering_mode):
         if os.path.exists(clustering_output):
             os.remove(clustering_output)
 
-
-
-    for group_id, count in paralog_groups.items():
-        print(f"Group ID: {group_id}, Number of new groups: {count}")
+
+    return paralog_groups
 
 
 def main():
     parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')
     ### Required Arguments
     required = parser.add_argument_group('Required Parameters')
-    required.add_argument('-
-        help='
+    required.add_argument('-input_directory', action='store', dest='input_directory',
+        help='Provide the directory of a PyamilySeq run.',
         required=True)
-    required.add_argument('-sequence_type', action='store', dest='sequence_type', default='
-        help='Default -
+    required.add_argument('-sequence_type', action='store', dest='sequence_type', default='AA',choices=['AA', 'DNA'],
+        help='Default - AA: Are groups "DNA" or "AA" sequences?',
         required=True)
     required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
         help='The total number of genomes must be provide',
         required=True)
-    required.add_argument('-output_dir', action='store', dest='output_dir',
-        help='Output directory.',
-        required=True)
 
+
+    ### Regrouping Arguments
     regrouping_params = parser.add_argument_group('Regrouping Parameters')
-    regrouping_params.add_argument('-groups', action="store", dest='groups', default=None,
-        help='Default -
-        'Provide "-groups
+    regrouping_params.add_argument('-groups', action="store", dest='groups', type=int, default=None,
+        help='Default - 99: groups to be split by pangenome grouping (see -group_threshold). '
+        'Provide "-groups 99" to split specific groups.',
+        required=False)
+    regrouping_params.add_argument('-group_ids', action="store", dest='group_ids', default=None,
+        help='Default - None: Provide "-group_ids 1,2,3,4" to split specific groups (see -group_threshold).',
         required=False)
     regrouping_params.add_argument('-group_threshold', action='store', dest='group_threshold', type=float, default=80,
-        help='Minimum percentage of genomes with multi-copy
+        help='Default: 80: Minimum percentage of genomes with multi-copy in a gene group to be split.',
+        required=False)
+
+    ### Output Arguments
+    output_args = parser.add_argument_group('Output Parameters')
+    output_args.add_argument('-a', action="store", dest='align_core', default=None,
+        help='Default - No output: SLOW! (Only works for Species mode) Output aligned and concatinated sequences of identified groups -'
+        'provide group levels at which to output "-a 99,95".',
+        required=False)
+
 
+    ### CD-HIT Reclustering Arguments
     cdhit_params = parser.add_argument_group('CD-HIT Reclustering Parameters')
     cdhit_params.add_argument('-c', action='store', dest='pident', type=float, default=0.8,
         help='Sequence identity threshold (default: 0.8) - Probably should be higher than what was used in initial clustering.')
     cdhit_params.add_argument('-s', action='store', dest='len_diff', type=float, default=0.20,
         help="Length difference cutoff (default: 0.20) - Often the most impactful parameter to split 'multi-copy' gene groups.")
-    cdhit_params.add_argument('-
-
+    cdhit_params.add_argument('-fastmode', action='store_true', dest='fast_mode',
+        help='Default False: Run CD-HIT with "-g 0" to speed up but reduce accuracy of clustering.',
+        required=False)
+    cdhit_params.add_argument('-T', action='store', dest='clustering_threads', type=int, default=8,
+        help='Number of threads for clustering (default: 8)')
     cdhit_params.add_argument('-M', action='store', dest='clustering_memory', type=int, default=2000,
         help='Memory limit in MB for clustering (default: 2000)')
 
+    ### MAFFT Alignment Arguments
+    alignment_args = parser.add_argument_group('Alignment Runtime Arguments - Optional when "-a" is provided.')
+
+    alignment_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
+        help="Default 8: Threads to be allocated for clustering and/or alignment.",
+        required=False)
 
+    ### Misc Arguments
     misc = parser.add_argument_group("Misc Parameters")
     misc.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
         help='Default: Delete all temporary files after processing.',
         required=False)
-    misc.add_argument("-verbose", action="store_true", dest="verbose"
+    misc.add_argument("-verbose", action="store_true", dest="verbose",
         help="Print verbose output.",
         required=False)
     misc.add_argument("-v", "--version", action="version",
@@ -366,15 +408,162 @@ def main():
 
 
 
-
-
-
-
-
+    ###External tool checks:
+    ##MAFFT
+    if options.align_core == True:
+        if is_tool_installed('mafft'):
+            if options.verbose == True:
+                print("mafft is installed. Proceeding with alignment.")
+        else:
+            exit("mafft is not installed. Please install mafft to proceed.")
+    ##CD-HIT
+
+    if is_tool_installed('cd-hit'):
+        if options.verbose == True:
+            print("cd-hit is installed. Proceeding with clustering.")
+        if options.sequence_type == 'DNA':
+            clustering_mode = 'cd-hit-est'
+        else:
+            clustering_mode = 'cd-hit'
+        if options.fast_mode == True:
+            options.fast_mode = 0
+            if options.verbose == True:
+                print("Running CD-HIT in fast mode.")
+        else:
+            options.fast_mode = 1
+            if options.verbose == True:
+                print("Running CD-HIT in slow mode.")
     else:
-
+        exit("cd-hit is not installed. Please install cd-hit to proceed.")
+
+    ##Alignment
+    if options.align_core != None:
+        if options.groups == None and options.group_ids == None:
+            sys.exit('Must provide "-groups" or "-group_ids" when requesting alignment with "-a".')
+
+    ##Output Directories
+    gene_groups_output = os.path.join(options.input_directory, "Gene_Groups_Output")
+    options.gene_groups_output = gene_groups_output
+    sub_groups_output = os.path.join(options.input_directory, "Sub_Groups_Output")
+    options.sub_groups_output = sub_groups_output
+    if not os.path.exists(gene_groups_output):
+        os.makedirs(gene_groups_output)
+    if not os.path.exists(sub_groups_output):
+        os.makedirs(sub_groups_output)
+
+    ## Get Summary Stats
+    summary_file = os.path.join(options.input_directory, 'summary_statistics.txt')
+
+    # Save arguments to a text file
+    params_out = os.path.join(options.input_directory, 'Group-Splitter_params.txt')
+    with open(params_out, "w") as outfile:
+        for arg, value in vars(options).items():
+            outfile.write(f"{arg}: {value}\n")
+
+
+
+    ## Group Selction - FIX THIS - currently fails if either are not provided
+    if options.groups != None and options.group_ids != None:
+        sys.exit('Must provide "-group_ids" or "-groups", not both.')
+    elif options.group_ids != None:
+        groups_to_use = ('ids', options.group_ids)
+    elif options.groups != None:
+        groups_to_use = ('groups', options.groups)
+    else:
+        groups_to_use = ('groups', 99)
+
+
+
+    paralog_groups = separate_groups(options, clustering_mode, groups_to_use)
+    ###
+    # Print metrics about paralog groups
+    print(f"Identified {len(paralog_groups)} paralog groups:")
+    for group_id, data in paralog_groups.items():
+        print(f"Group ID: {group_id}, Number of new groups: {data['count']}, Sizes: {data['sizes']}")
+    ###
+
+
+    # Read summary statistics
+    with open(summary_file, 'r') as f:
+        summary_data = f.read().splitlines()
+
+    summary_info = {}
+    for line in summary_data:
+        if ':' in line:
+            key, value = line.split(':')
+            summary_info[key.strip()] = int(value.strip())
+
+    genome_num = summary_info['Number of Genomes']
+    core_99 = summary_info['First_core_99']
+    core_95 = summary_info['First_core_95']
+    core_15 = summary_info['First_core_15']
+    core_0 = summary_info['First_core_0']
+    total_gene_groups = summary_info['Total Number of First Gene Groups (Including Singletons)']
+
+    # Initialise new core values
+    new_core_99 = core_99
+    new_core_95 = core_95
+    new_core_15 = core_15
+    new_core_0 = core_0
+
+    # Recalculate each *_core_* value
+    for group_id, data in paralog_groups.items():
+        group_id = group_id.replace('>Group_', '')
+        original_group = next((f for f in os.listdir(gene_groups_output) if f.endswith(f'_{group_id}.fasta')), None)
+        original_group = int(original_group.split('_')[2])
+        if original_group == 99:
+            new_core_99 -= 1
+        elif original_group == 95:
+            new_core_95 -= 1
+        elif original_group == 15:
+            new_core_15 -= 1
+        elif original_group == 0:
+            new_core_0 -= 1
+
+        for size in data['sizes']:
+            if size >= math.floor(99 * genome_num / 100):
+                new_core_99 += 1
+            elif size >= math.floor(95 * genome_num / 100):
+                new_core_95 += 1
+            elif size >= math.floor(15 * genome_num / 100):
+                new_core_15 += 1
+            elif size >= math.floor(0 * genome_num / 100):
+                new_core_0 += 1
+
+
+
+
+    # Write out the new summary statistics - currently only works for default cores
+    stats_out = summary_file.replace('.txt','_recalculated.txt')
+    key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
+    with open(stats_out, 'w') as outfile:
+        print("Number of Genomes: " + str(options.genome_num))
+        outfile.write("Number of Genomes: " + str(options.genome_num) + "\n")
+        print("Reclaculated Gene Groups:")
+        outfile.write("Recalculated Gene Groups\n")
+        print(f"First_core_99: {new_core_99}")
+        outfile.write(f"First_core_99: {new_core_99}\n")
+        print(f"First_core_95: {new_core_95}")
+        outfile.write(f"First_core_95: {new_core_95}\n")
+        print(f"First_core_15: {new_core_15}")
+        outfile.write(f"First_core_15: {new_core_15}\n")
+        print(f"First_core_0: {new_core_0}")
+        outfile.write(f"First_core_0: {new_core_0}\n")
+        print("Total Number of First Gene Groups (Including Singletons): " + str(total_gene_groups))
+        outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(total_gene_groups))
+
+    # Alignment
+    if options.align_core != None:
+        print("\n\nProcessing gene group alignment")
+        group_directory = options.gene_groups_output
+        sub_group_directory = options.sub_groups_output
+        genome_list = read_genomes_from_fasta(options.gene_groups_output + '/combined_group_sequences_dna.fasta')
+        process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, 'concatenated_genes_post_splitting_aligned_dna.fasta')
+
+
+
+
 
-    separate_groups(options, clustering_mode)
 
 
 if __name__ == "__main__":