PyamilySeq 0.0.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +15 -1
- PyamilySeq/PyamilySeq_Species.py +257 -36
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.3.0.dist-info}/METADATA +36 -3
- PyamilySeq-0.3.0.dist-info/RECORD +11 -0
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.3.0.dist-info}/WHEEL +1 -1
- PyamilySeq-0.0.2.dist-info/RECORD +0 -11
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.3.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.3.0.dist-info}/entry_points.txt +0 -0
- {PyamilySeq-0.0.2.dist-info → PyamilySeq-0.3.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1 +1,15 @@
|
|
|
1
|
-
|
|
1
|
+
import subprocess
|
|
2
|
+
|
|
3
|
+
PyamilySeq_Version = 'v0.3.0'
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def is_tool_installed(tool_name):
    """Check whether an external tool can be run from PATH.

    The tool is probed by running ``<tool_name> --version`` with its output
    discarded.

    Args:
        tool_name: Name (or path) of the executable to probe.

    Returns:
        True if the command started and exited with status 0; False if it
        exited with a non-zero status or could not be launched at all.
    """
    try:
        subprocess.run(
            [tool_name, '--version'],
            # Output is never inspected, so discard it instead of buffering it.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (nothing by that name) as well as
        # PermissionError (a file was found but it is not executable); in
        # both cases the tool is unusable, so report it as not installed.
        return False
    return True
|
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -6,6 +6,9 @@ import math
|
|
|
6
6
|
import sys
|
|
7
7
|
import argparse
|
|
8
8
|
import os
|
|
9
|
+
from tempfile import NamedTemporaryFile
|
|
10
|
+
|
|
11
|
+
|
|
9
12
|
|
|
10
13
|
try:
|
|
11
14
|
from .Constants import *
|
|
@@ -20,6 +23,75 @@ def sort_keys_by_values(dict1, dict2):
|
|
|
20
23
|
sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
|
|
21
24
|
return sorted_keys
|
|
22
25
|
|
|
26
|
+
def select_longest_gene(sequences):
    """Keep only the longest sequence of every genome.

    The genome name is taken to be the part of the sequence ID before the
    first ``|``. When several sequences of one genome share the maximum
    length, the one encountered first is kept.

    Args:
        sequences: Mapping of sequence ID to sequence string.

    Returns:
        Mapping of genome name to a ``(sequence ID, sequence)`` tuple.
    """
    best_per_genome = {}
    for record_id, record_seq in sequences.items():
        genome_key = record_id.split('|')[0]
        current_best = best_per_genome.get(genome_key)
        # Only a strictly longer sequence displaces the one already stored.
        if current_best is not None and len(record_seq) <= len(current_best[1]):
            continue
        best_per_genome[genome_key] = (record_id, record_seq)
    return best_per_genome
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def run_mafft_on_sequences(sequences, output_file):
    """Align sequences with mafft and write the alignment to a file.

    The sequences are written to a temporary FASTA file, ``mafft --auto`` is
    run on it with stdout redirected to ``output_file``, and the temporary
    file is removed afterwards.

    Args:
        sequences: Mapping of FASTA header (without ``>``) to sequence.
        output_file: Path of the file that receives mafft's stdout.

    Raises:
        subprocess.CalledProcessError: If mafft exits with a non-zero status.
        OSError: If mafft cannot be launched or a file cannot be written.
    """
    temp_input_file_path = None
    try:
        # The temporary file is created inside the try block so that it is
        # also removed when writing the input itself fails part-way.
        with NamedTemporaryFile('w', delete=False) as temp_input_file:
            temp_input_file_path = temp_input_file.name
            for header, sequence in sequences.items():
                temp_input_file.write(f">{header}\n{sequence}\n")

        with open(output_file, 'w') as output_f:
            subprocess.run(
                ['mafft', '--auto', temp_input_file_path],
                stdout=output_f,
                stderr=subprocess.DEVNULL,  # Suppress stderr
                check=True
            )
    finally:
        if temp_input_file_path is not None:
            os.remove(temp_input_file_path)  # Clean up the temporary file
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def process_gene_families(directory, output_file):
    """Align each gene family and write one concatenated alignment.

    For every ``*.fasta`` file in ``directory`` (processed in sorted name
    order) the longest sequence per genome is selected and aligned with
    mafft. The aligned rows are appended to a per-genome concatenated
    sequence. A genome that is absent from a family receives gap characters
    (``-``) for that family's columns, so every row of the result has the
    same length.

    Args:
        directory: Directory containing one FASTA file per gene family.
        output_file: File name of the concatenated alignment. It is written
            into the parent directory of ``directory``.

    Raises:
        subprocess.CalledProcessError: If mafft fails on a gene family.
        OSError: If a file cannot be read or written.
    """
    # Genome name -> list of aligned chunks; joined once when writing.
    concatenated_chunks = {}
    total_columns = 0

    # Place the result beside the gene-family directory. Built from path
    # components rather than by string replacement, so it works whatever the
    # directory is called.
    output_path = os.path.join(os.path.dirname(os.path.normpath(directory)), output_file)

    # Sorted so the column order of the concatenation is reproducible.
    for gene_file in sorted(os.listdir(directory)):
        if not gene_file.endswith('.fasta'):
            continue
        gene_path = os.path.join(directory, gene_file)

        # Read the family and keep the longest sequence of each genome.
        longest_sequences = select_longest_gene(read_fasta(gene_path))
        if not longest_sequences:
            continue  # An empty family contributes no columns.
        to_align = {seq_id: seq for seq_id, seq in longest_sequences.values()}

        # Reserve a scratch path in the temp directory for mafft's output and
        # make sure it is removed even when alignment or parsing fails.
        with NamedTemporaryFile('w', suffix='_aligned.fasta', delete=False) as scratch:
            aligned_file = scratch.name
        try:
            run_mafft_on_sequences(to_align, aligned_file)
            aligned_sequences = read_fasta(aligned_file)
        finally:
            os.remove(aligned_file)

        alignment_width = max((len(seq) for seq in aligned_sequences.values()), default=0)
        genomes_in_family = set()
        for header, aligned_seq in aligned_sequences.items():
            genome_name = header.split('|')[0]
            if genome_name not in concatenated_chunks:
                # First appearance: gap-fill the columns of earlier families.
                concatenated_chunks[genome_name] = ['-' * total_columns]
            concatenated_chunks[genome_name].append(aligned_seq)
            genomes_in_family.add(genome_name)

        # Genomes seen before but missing from this family get gaps.
        for genome_name, chunks in concatenated_chunks.items():
            if genome_name not in genomes_in_family:
                chunks.append('-' * alignment_width)
        total_columns += alignment_width

    # Write the concatenated sequences to the output file
    with open(output_path, 'w') as out:
        for genome, chunks in concatenated_chunks.items():
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(''.join(chunks), 60)
            out.write(f"{wrapped_sequence}\n")
|
|
94
|
+
|
|
23
95
|
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
24
96
|
print("Outputting gene_presence_absence file")
|
|
25
97
|
in_name = options.clusters.split('.')[0]
|
|
@@ -57,6 +129,27 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
57
129
|
# edge_list_outfile.write(line + '\n')
|
|
58
130
|
|
|
59
131
|
|
|
132
|
+
def wrap_sequence(sequence, width=60):
    """Split a sequence into newline-separated lines of at most ``width`` characters.

    Args:
        sequence: The sequence string to wrap.
        width: Maximum number of characters per line.

    Returns:
        The wrapped sequence without a trailing newline; an empty string
        for an empty sequence.
    """
    pieces = [sequence[start:start + width] for start in range(0, len(sequence), width)]
    return "\n".join(pieces)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def read_fasta(fasta_file):
    """Read a FASTA file into a dictionary.

    Blank lines are skipped and surrounding whitespace is stripped from
    every line. The whole header line (minus the leading ``>``) is used as
    the key. If a header occurs more than once, the last record with that
    header replaces the earlier one.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        Mapping of header to the record's sequence joined into one string.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    # Lines are collected in lists and joined once at the end; growing a
    # string stored in a dict with ``+=`` copies it again for every line.
    chunks_by_header = {}
    current_chunks = None
    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # Skip empty lines
            if line.startswith('>'):
                current_chunks = []
                chunks_by_header[line[1:]] = current_chunks  # Remove '>' character
            elif current_chunks is None:
                raise ValueError(
                    f"{fasta_file}: sequence data found before the first FASTA header"
                )
            else:
                current_chunks.append(line)
    return {header: ''.join(chunks) for header, chunks in chunks_by_header.items()}
|
|
60
153
|
|
|
61
154
|
|
|
62
155
|
def reorder_dict_by_keys(original_dict, sorted_keys):
|
|
@@ -71,31 +164,35 @@ def get_cores(options,genome_dict):
|
|
|
71
164
|
for group in options.core_groups.split(','):
|
|
72
165
|
calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
|
|
73
166
|
if first == False:
|
|
74
|
-
|
|
167
|
+
# Ensure no overlap
|
|
168
|
+
# if calculated_floor <= prev_top:
|
|
169
|
+
# calculated_floor = prev_top - 1
|
|
170
|
+
|
|
171
|
+
groups[group] = (calculated_floor,prev_top)
|
|
75
172
|
else:
|
|
76
173
|
groups[group] = (calculated_floor, prev_top)
|
|
77
174
|
first = False
|
|
78
175
|
prev_top = calculated_floor
|
|
79
176
|
first_core_group = 'first_core_' + group
|
|
80
|
-
cores[first_core_group] =
|
|
177
|
+
cores[first_core_group] = []
|
|
81
178
|
if options.reclustered != None:
|
|
82
179
|
extended_core_group = 'extended_core_' + group
|
|
83
|
-
cores[extended_core_group] =
|
|
180
|
+
cores[extended_core_group] = []
|
|
84
181
|
combined_core_group = 'combined_core_' + group
|
|
85
|
-
cores[combined_core_group] =
|
|
182
|
+
cores[combined_core_group] = []
|
|
86
183
|
second_core_group = 'second_core_' + group
|
|
87
|
-
cores[second_core_group] =
|
|
184
|
+
cores[second_core_group] = []
|
|
88
185
|
only_second_core_group = 'only_second_core_' + group
|
|
89
|
-
cores[only_second_core_group] =
|
|
186
|
+
cores[only_second_core_group] = []
|
|
90
187
|
return cores, groups
|
|
91
188
|
|
|
92
189
|
#@profile
|
|
93
|
-
def calc_First_only_core(pep_num, groups, cores):
|
|
190
|
+
def calc_First_only_core(cluster, pep_num, groups, cores):
|
|
94
191
|
groups_as_list = list(groups.values())
|
|
95
192
|
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num <= fir):
|
|
96
193
|
res = idx
|
|
97
194
|
family_group = list(groups)[res]
|
|
98
|
-
cores['first_core_'+family_group]
|
|
195
|
+
cores['first_core_'+family_group].append(cluster)
|
|
99
196
|
|
|
100
197
|
#@profile
|
|
101
198
|
def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_num): # Count gene families extended with StORFs
|
|
@@ -103,7 +200,7 @@ def calc_single_First_extended_Second_only_core(pep_num, groups, cores, second_n
|
|
|
103
200
|
for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= pep_num+second_num <= fir):
|
|
104
201
|
res = idx
|
|
105
202
|
family_group = list(groups)[res]
|
|
106
|
-
cores['extended_core_' + family_group]
|
|
203
|
+
cores['extended_core_' + family_group].append(pep_num)
|
|
107
204
|
|
|
108
205
|
|
|
109
206
|
#@profile
|
|
@@ -188,28 +285,28 @@ def combined_clustering_counting(options, pangenome_clusters_First, reps, combin
|
|
|
188
285
|
|
|
189
286
|
#@profile
|
|
190
287
|
def single_clustering_counting(options, pangenome_clusters_First, reps):
|
|
191
|
-
|
|
192
|
-
|
|
288
|
+
num_clustered_First = defaultdict(list)
|
|
289
|
+
recorded_First = []
|
|
193
290
|
pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
|
|
194
291
|
list_of_reps = list(reps.keys())
|
|
195
|
-
for cluster,
|
|
292
|
+
for cluster, First_genomes in pangenome_clusters_First.items():
|
|
196
293
|
rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
|
|
197
294
|
|
|
198
295
|
try: # get the cluster from the storf clusters which contains this rep
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
for
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
pangenome_clusters_Type[cluster] = [len(
|
|
207
|
-
|
|
296
|
+
num_clustered_First[cluster].append(rep + '_' + str(len(First_genomes)))
|
|
297
|
+
size_of_First_clusters = []
|
|
298
|
+
Firsts = num_clustered_First[cluster]
|
|
299
|
+
for First in Firsts:
|
|
300
|
+
First = First.rsplit('_', 1)
|
|
301
|
+
size_of_First_clusters.append(int(First[1]))
|
|
302
|
+
recorded_First.append(First[0])
|
|
303
|
+
pangenome_clusters_Type[cluster] = [len(num_clustered_First[cluster]), sum(size_of_First_clusters),
|
|
304
|
+
size_of_First_clusters, 0, 0, 0]
|
|
208
305
|
|
|
209
306
|
except KeyError:
|
|
210
307
|
###Singleton
|
|
211
|
-
num_pep_genomes = [len(
|
|
212
|
-
pangenome_clusters_Type[cluster] = [1, len(
|
|
308
|
+
num_pep_genomes = [len(First_genomes)]
|
|
309
|
+
pangenome_clusters_Type[cluster] = [1, len(First_genomes), num_pep_genomes, 0, 0, 0]
|
|
213
310
|
|
|
214
311
|
return pangenome_clusters_Type
|
|
215
312
|
|
|
@@ -472,7 +569,7 @@ def cluster(options):
|
|
|
472
569
|
pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
|
|
473
570
|
|
|
474
571
|
|
|
475
|
-
|
|
572
|
+
|
|
476
573
|
Number_Of_StORF_Extending_But_Same_Genomes = 0
|
|
477
574
|
|
|
478
575
|
sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
|
|
@@ -483,12 +580,12 @@ def cluster(options):
|
|
|
483
580
|
print("Calculating Groups")
|
|
484
581
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
485
582
|
############################### Calculate First only
|
|
486
|
-
if numbers[0] == 1 and numbers[1] >=2:
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
elif numbers[0] >1 and numbers[1] >=2:
|
|
490
|
-
|
|
491
|
-
|
|
583
|
+
#if numbers[0] == 1 and numbers[1] >=2:
|
|
584
|
+
calc_First_only_core(cluster, numbers[1],groups,cores)
|
|
585
|
+
|
|
586
|
+
# elif numbers[0] >1 and numbers[1] >=2:
|
|
587
|
+
# calc_First_only_core(cluster, numbers[2][0],groups,cores)
|
|
588
|
+
|
|
492
589
|
|
|
493
590
|
if options.reclustered != None:
|
|
494
591
|
############################# Calculate First and Reclustered-Second
|
|
@@ -511,17 +608,117 @@ def cluster(options):
|
|
|
511
608
|
if data[1] >= 2:
|
|
512
609
|
calc_only_Second_only_core(groups, cores, data[1])
|
|
513
610
|
###########################
|
|
514
|
-
print("End")
|
|
515
611
|
key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
|
|
516
|
-
print("Gene
|
|
612
|
+
print("Gene Groups:")
|
|
517
613
|
for key_prefix in key_order:
|
|
518
614
|
for key, value in cores.items():
|
|
519
615
|
if key.startswith(key_prefix):
|
|
520
|
-
print(f"{key}: {value}")
|
|
616
|
+
print(f"{key}: {len(value)}")
|
|
617
|
+
print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
521
618
|
|
|
522
619
|
if options.gene_presence_absence_out != None:
|
|
523
620
|
gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
524
621
|
|
|
622
|
+
if options.write_families != None and options.fasta != None:
|
|
623
|
+
sequences = read_fasta(options.fasta)
|
|
624
|
+
input_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
625
|
+
output_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
626
|
+
|
|
627
|
+
# Create output directory if it doesn't exist
|
|
628
|
+
if not os.path.exists(output_dir):
|
|
629
|
+
os.makedirs(output_dir)
|
|
630
|
+
for key_prefix in key_order:
|
|
631
|
+
for key, values in cores.items():
|
|
632
|
+
if any(part in options.write_families.split(',') for part in key.split('_')):
|
|
633
|
+
if key.startswith(key_prefix):
|
|
634
|
+
for value in values:
|
|
635
|
+
output_filename = f"{key}_{value}.fasta"
|
|
636
|
+
sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
637
|
+
# Write sequences to output file that are in the sequences dictionary
|
|
638
|
+
with open(os.path.join(output_dir, output_filename), 'w') as outfile:
|
|
639
|
+
for header in sequences_to_write:
|
|
640
|
+
if header in sequences:
|
|
641
|
+
outfile.write(f">{header}\n")
|
|
642
|
+
wrapped_sequence = wrap_sequence(sequences[header])
|
|
643
|
+
outfile.write(f"{wrapped_sequence}\n")
|
|
644
|
+
|
|
645
|
+
if options.con_core != None and options.fasta != None and options.write_families != None:
|
|
646
|
+
process_gene_families(os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
# groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
650
|
+
# """Run mafft on all .fasta files in the given directory."""
|
|
651
|
+
# for filename in os.listdir(groups_dir):
|
|
652
|
+
# if filename.endswith('.fasta'):
|
|
653
|
+
# input_path = os.path.join(groups_dir, filename)
|
|
654
|
+
# output_filename = filename.replace('.fasta', '_mafft.aln')
|
|
655
|
+
# output_path = os.path.join(groups_dir, output_filename)
|
|
656
|
+
#
|
|
657
|
+
# # Call mafft command
|
|
658
|
+
# try:
|
|
659
|
+
# with open(output_path, 'w') as output_file:
|
|
660
|
+
# subprocess.run(
|
|
661
|
+
# ['mafft', '--auto', input_path],
|
|
662
|
+
# stdout=output_file,
|
|
663
|
+
# stderr=subprocess.DEVNULL, # Suppress stderr
|
|
664
|
+
# check=True
|
|
665
|
+
# )
|
|
666
|
+
# print(f"Processed {input_path} -> {output_path}")
|
|
667
|
+
# except subprocess.CalledProcessError as e:
|
|
668
|
+
# print(f"Failed to process {input_path}: {e}")
|
|
669
|
+
|
|
670
|
+
##This could be run once and not above AND here..
|
|
671
|
+
# output_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
672
|
+
# sequences = read_fasta(options.fasta)
|
|
673
|
+
# concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
|
|
674
|
+
#
|
|
675
|
+
#
|
|
676
|
+
# for key_prefix in key_order:
|
|
677
|
+
# for key, values in cores.items():
|
|
678
|
+
# if any(part in options.con_core.split(',') for part in key.split('_')):
|
|
679
|
+
# if key.startswith(key_prefix):
|
|
680
|
+
# for value in values:
|
|
681
|
+
# length_capture = {genome: [] for genome in genome_dict.keys()}
|
|
682
|
+
# sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
683
|
+
# for header in sequences_to_write:
|
|
684
|
+
# if header in sequences:
|
|
685
|
+
# length_capture[header.split('|')[0]].append([header,len(sequences[header])])
|
|
686
|
+
# if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
|
|
687
|
+
# for genome, lengths in length_capture.items():
|
|
688
|
+
# max_value = float('-inf')
|
|
689
|
+
# max_item = None
|
|
690
|
+
# for length in lengths:
|
|
691
|
+
# current_value = length[1]
|
|
692
|
+
# if current_value > max_value:
|
|
693
|
+
# max_value = current_value
|
|
694
|
+
# max_item = length[0]
|
|
695
|
+
# concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
|
|
696
|
+
#
|
|
697
|
+
#
|
|
698
|
+
# with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
|
|
699
|
+
# for genome, sequence in concatenated_sequences.items():
|
|
700
|
+
# outfile.write(f">{genome}\n")
|
|
701
|
+
# wrapped_sequence = wrap_sequence(sequence)
|
|
702
|
+
# outfile.write(f"{wrapped_sequence}\n")
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
# for core_gene_family in core_gene_families:
|
|
706
|
+
# found_sequences = {genome: False for genome in genomes}
|
|
707
|
+
#
|
|
708
|
+
# for fasta_file in fasta_files:
|
|
709
|
+
# sequences = read_fasta(fasta_file)
|
|
710
|
+
# for header, sequence in sequences.items():
|
|
711
|
+
# genome = header.split('|')[0]
|
|
712
|
+
# if genome in genomes and core_gene_family in header:
|
|
713
|
+
# concatenated_sequences[genome] += sequence
|
|
714
|
+
# found_sequences[genome] = True
|
|
715
|
+
#
|
|
716
|
+
# for genome in genomes:
|
|
717
|
+
# if not found_sequences[genome]:
|
|
718
|
+
# concatenated_sequences[genome] += '-' * len(next(iter(sequences.values())))
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
|
|
525
722
|
|
|
526
723
|
def main():
|
|
527
724
|
|
|
@@ -534,14 +731,24 @@ def main():
|
|
|
534
731
|
required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
|
|
535
732
|
help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))', required=True)
|
|
536
733
|
|
|
734
|
+
output_args = parser.add_argument_group('Output Parameters')
|
|
735
|
+
output_args.add_argument('-w', action="store", dest='write_families', default=None,
|
|
736
|
+
help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
|
|
737
|
+
' - Must provide FASTA file with -fasta')
|
|
738
|
+
output_args.add_argument('-con', action="store", dest='con_core', default=None,
|
|
739
|
+
help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
|
|
740
|
+
' - Must provide FASTA file with -fasta')
|
|
741
|
+
output_args.add_argument('-fasta', action='store', dest='fasta',
|
|
742
|
+
help='FASTA file to use in conjunction with "-w" or "-con"',
|
|
743
|
+
required=False)
|
|
537
744
|
|
|
538
745
|
optional = parser.add_argument_group('Optional Arguments')
|
|
539
746
|
optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
|
|
540
747
|
required=False)
|
|
541
748
|
optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
|
|
542
749
|
required=False)
|
|
543
|
-
optional.add_argument('-groups', action="store", dest='core_groups', default="99,
|
|
544
|
-
help='Default - (\'99,95,
|
|
750
|
+
optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
|
|
751
|
+
help='Default - (\'99,95,15\'): Gene family groups to use')
|
|
545
752
|
optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
|
|
546
753
|
required=False)
|
|
547
754
|
|
|
@@ -562,6 +769,16 @@ def main():
|
|
|
562
769
|
if options.sequence_tag == None:
|
|
563
770
|
options.sequence_tag = 'StORF'
|
|
564
771
|
|
|
772
|
+
if options.con_core == True:
|
|
773
|
+
if is_tool_installed('mafft'):
|
|
774
|
+
print("mafft is installed. Proceeding with alignment.")
|
|
775
|
+
else:
|
|
776
|
+
print("mafft is not installed. Please install mafft to proceed.")
|
|
777
|
+
|
|
778
|
+
if options.write_families != None and options.fasta == False:
|
|
779
|
+
exit("-fasta must br provided if -w is used")
|
|
780
|
+
|
|
781
|
+
|
|
565
782
|
options.clusters = os.path.normpath(options.clusters)
|
|
566
783
|
options.clusters = os.path.realpath(options.clusters)
|
|
567
784
|
if options.reclustered:
|
|
@@ -573,6 +790,10 @@ def main():
|
|
|
573
790
|
|
|
574
791
|
cluster(options)
|
|
575
792
|
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
|
|
576
797
|
print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
|
|
577
798
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
578
799
|
|
|
@@ -582,5 +803,5 @@ def main():
|
|
|
582
803
|
|
|
583
804
|
if __name__ == "__main__":
|
|
584
805
|
main()
|
|
585
|
-
print("
|
|
806
|
+
print("Done")
|
|
586
807
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 0.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -13,7 +13,7 @@ Requires-Python: >=3.6
|
|
|
13
13
|
Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
|
15
15
|
|
|
16
|
-
# PyamilySeq
|
|
16
|
+
# PyamilySeq - !BETA!
|
|
17
17
|
PyamilySeq (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, DIAMOND or MMseqs2.
|
|
18
18
|
This work is an extension of the gene family / pangenome tool developed for the StORF-Reporter publication in NAR (https://doi.org/10.1093/nar/gkad814).
|
|
19
19
|
|
|
@@ -31,7 +31,39 @@ PyamilySeq requires Python 3.6 or higher. Install dependencies using pip:
|
|
|
31
31
|
pip install PyamilySeq
|
|
32
32
|
```
|
|
33
33
|
|
|
34
|
-
## Usage
|
|
34
|
+
## Usage - Menu
|
|
35
|
+
```
|
|
36
|
+
usage: PyamilySeq_Species.py [-h] -c CLUSTERS -f {CD-HIT,CSV,TSV} [-w WRITE_FAMILIES] [-con CON_CORE] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG]
|
|
37
|
+
[-groups CORE_GROUPS] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
|
|
38
|
+
|
|
39
|
+
PyamilySeq v0.3.0: PyamilySeq Run Parameters.
|
|
40
|
+
|
|
41
|
+
Required Arguments:
|
|
42
|
+
-c CLUSTERS Clustering output file from CD-HIT, TSV or CSV Edge List
|
|
43
|
+
-f {CD-HIT,CSV,TSV} Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))
|
|
44
|
+
|
|
45
|
+
Output Parameters:
|
|
46
|
+
-w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95" - Must provide FASTA file
|
|
47
|
+
with -fasta
|
|
48
|
+
-con CON_CORE Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to
|
|
49
|
+
output "-w 99,95" - Must provide FASTA file with -fasta
|
|
50
|
+
-fasta FASTA FASTA file to use in conjunction with "-w" or "-con"
|
|
51
|
+
|
|
52
|
+
Optional Arguments:
|
|
53
|
+
-rc RECLUSTERED Clustering output file from secondary round of clustering
|
|
54
|
+
-st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
|
|
55
|
+
-groups CORE_GROUPS Default - ('99,95,15'): Gene family groups to use
|
|
56
|
+
-gpa GENE_PRESENCE_ABSENCE_OUT
|
|
57
|
+
Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other
|
|
58
|
+
downstream tools
|
|
59
|
+
|
|
60
|
+
Misc:
|
|
61
|
+
-verbose {True,False}
|
|
62
|
+
Default - False: Print out runtime messages
|
|
63
|
+
-v Default - False: Print out version number and exit
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
```
|
|
35
67
|
|
|
36
68
|
### Clustering Analysis
|
|
37
69
|
|
|
@@ -58,6 +90,7 @@ Replace `reclustered_file` with the path to the file containing additional seque
|
|
|
58
90
|
PyamilySeq generates various outputs, including:
|
|
59
91
|
|
|
60
92
|
- **Gene Presence-Absence File**: This CSV file details the presence and absence of genes across genomes.
|
|
93
|
+
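- **FASTA Files for Each Gene Family**: When `-w` is given together with `-fasta`, the sequences of each selected gene family are written to their own FASTA file in a `Gene_Families_Output` directory next to the clustering input file.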
|
|
61
94
|
|
|
62
95
|
## Gene Family Groups
|
|
63
96
|
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
+
PyamilySeq/Constants.py,sha256=PdgSIux2jfv6QlAOxRIFgbsH95Xq6DMQcvZodGsk7tw,399
|
|
3
|
+
PyamilySeq/PyamilySeq_Species.py,sha256=zLGfyTtxk4znoUevyjfb978pT3XNjWu44-8Seqnl7ec,38961
|
|
4
|
+
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
6
|
+
PyamilySeq-0.3.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
7
|
+
PyamilySeq-0.3.0.dist-info/METADATA,sha256=iJOcBDtkFBUZTFGQMBwdUuaZnKAO1I9Pc-YFgvvhySQ,4382
|
|
8
|
+
PyamilySeq-0.3.0.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
|
|
9
|
+
PyamilySeq-0.3.0.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
10
|
+
PyamilySeq-0.3.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
11
|
+
PyamilySeq-0.3.0.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
-
PyamilySeq/Constants.py,sha256=hrbTdmPUFEzLfGZOPoQPV0NsAG-VnfIX51291vqb1C8,30
|
|
3
|
-
PyamilySeq/PyamilySeq_Species.py,sha256=34NHcViENyAdvGRltNUbfWjEcNCYnsmbuhDdl8__mH0,28209
|
|
4
|
-
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
6
|
-
PyamilySeq-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
7
|
-
PyamilySeq-0.0.2.dist-info/METADATA,sha256=v6hOL3kekqt8H5YhjpS6uQOF1QSFcBh4Zy-jNW3xDTk,2550
|
|
8
|
-
PyamilySeq-0.0.2.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
9
|
-
PyamilySeq-0.0.2.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
10
|
-
PyamilySeq-0.0.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
11
|
-
PyamilySeq-0.0.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|