PyamilySeq 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +15 -1
- PyamilySeq/PyamilySeq.py +186 -0
- PyamilySeq/PyamilySeq_Species.py +200 -35
- PyamilySeq-0.4.0.dist-info/METADATA +92 -0
- PyamilySeq-0.4.0.dist-info/RECORD +12 -0
- {PyamilySeq-0.2.0.dist-info → PyamilySeq-0.4.0.dist-info}/WHEEL +1 -1
- PyamilySeq-0.4.0.dist-info/entry_points.txt +2 -0
- PyamilySeq-0.2.0.dist-info/METADATA +0 -101
- PyamilySeq-0.2.0.dist-info/RECORD +0 -11
- PyamilySeq-0.2.0.dist-info/entry_points.txt +0 -2
- {PyamilySeq-0.2.0.dist-info → PyamilySeq-0.4.0.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.2.0.dist-info → PyamilySeq-0.4.0.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1 +1,15 @@
|
|
|
1
|
-
|
|
1
|
+
import subprocess
|
|
2
|
+
|
|
3
|
+
PyamilySeq_Version = 'v0.4.0'
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def is_tool_installed(tool_name):
    """Check if a tool is installed and available in PATH.

    The tool is probed by running ``<tool_name> --version`` with all of its
    output discarded.

    Args:
        tool_name: Name (or path) of the executable to probe.

    Returns:
        True if the command could be started and exited with status 0,
        False if it could not be started or exited with a non-zero status.
    """
    try:
        subprocess.run([tool_name, '--version'], stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (nothing by that name on PATH) and
        # also PermissionError / NotADirectoryError (something was found but
        # cannot be executed); the latter used to escape as a traceback.
        return False
    return True
|
PyamilySeq/PyamilySeq.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
import os
|
|
4
|
+
import glob
|
|
5
|
+
import subprocess
|
|
6
|
+
from PyamilySeq_Species import *
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from .PyamilySeq_Species import cluster
|
|
11
|
+
from .Constants import *
|
|
12
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
13
|
+
from PyamilySeq_Species import cluster
|
|
14
|
+
from Constants import *
|
|
15
|
+
|
|
16
|
+
# Complement of every IUPAC nucleotide code, in both cases, so soft-masked
# (lower-case) and ambiguous bases do not abort the run.
_COMPLEMENT_TABLE = str.maketrans(
    'ACGTUNRYSWKMBDHVacgtunryswkmbdhv',
    'TGCAANYRSWMKVHDBtgcaanyrswmkvhdb',
)


def reverse_complement(seq):
    """Return the reverse complement of a nucleotide sequence.

    Upper-case A/C/G/T/N behave exactly as before. Lower-case bases and
    IUPAC ambiguity codes are now complemented as well (case is preserved),
    and any other character (for example a gap ``-``) is passed through
    unchanged instead of raising ``KeyError``.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        The reverse-complemented sequence as a string.
    """
    return seq.translate(_COMPLEMENT_TABLE)[::-1]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def read_separate_files(input_dir, name_split, combined_out):
    """Collect the CDS sequences of every FASTA/GFF pair into one FASTA file.

    Every file in ``input_dir`` whose name ends with ``name_split`` is treated
    as a genome FASTA. Its annotation is expected next to it, with the same
    name but a ``.gff`` extension; genomes without that file are skipped.

    Args:
        input_dir: Directory containing the FASTA and GFF files.
        name_split: Filename suffix used both to select the FASTA files and to
            cut the genome name out of the filename.
        combined_out: Path of the combined FASTA file to (over)write.
    """
    with open(combined_out, 'w') as combined_out_file:
        # sorted() keeps the output order reproducible across filesystems.
        for fasta_file in sorted(glob.glob(os.path.join(input_dir, '*' + name_split))):
            genome_name = os.path.basename(fasta_file).split(name_split)[0]
            # Swap only the final extension. str.replace() also rewrote
            # directory names containing '.fasta' and was a no-op for other
            # extensions such as '.fa', which paired the FASTA with itself.
            corresponding_gff_file = os.path.splitext(fasta_file)[0] + '.gff'
            if not os.path.exists(corresponding_gff_file):
                continue
            # NOTE(review): extract_cds_from_gff is not defined in the visible
            # part of this module; presumably it comes from the star-import of
            # PyamilySeq_Species and yields (gene_name, sequence) pairs - confirm.
            cds_sequences = extract_cds_from_gff(fasta_file, corresponding_gff_file)
            for gene_name, seq in cds_sequences:
                # '|' separates genome from gene, matching read_combined_files
                # and the seq_id.split('|') parsing used downstream.
                combined_out_file.write(f">{genome_name}|{gene_name}\n{seq}\n")
|
|
33
|
+
|
|
34
|
+
# Complement table (IUPAC codes, both cases) used for minus-strand CDS.
_CDS_COMPLEMENT = str.maketrans(
    'ACGTUNRYSWKMBDHVacgtunryswkmbdhv',
    'TGCAANYRSWMKVHDBtgcaanyrswmkvhdb',
)


def read_combined_files(input_dir, name_split, combined_out):
    """Extract CDS sequences from GFF files that embed their own FASTA.

    Every file in ``input_dir`` whose name ends with ``name_split`` is read.
    Lines before ``##FASTA`` are parsed as tab-separated GFF records (only
    9-column ``CDS`` rows are used); lines after it are parsed as FASTA.
    Each CDS is cut out of its contig using the 1-based inclusive
    start/end columns, reverse-complemented when the strand column is ``-``,
    wrapped at 60 characters and written as ``>genome|ID``.

    Args:
        input_dir: Directory containing the combined GFF files.
        name_split: Filename suffix used both to select the files and to cut
            the genome name out of the filename.
        combined_out: Path of the combined FASTA file to (over)write.

    Raises:
        IndexError: If a CDS row has no ``ID=`` attribute (unchanged).
        ValueError: If a CDS start/end column is not an integer (unchanged).
    """
    with open(combined_out, 'w') as combined_out_file:
        # sorted() keeps the output order reproducible across filesystems.
        for gff_file in sorted(glob.glob(os.path.join(input_dir, '*' + name_split))):
            genome_name = os.path.basename(gff_file).split(name_split)[0]
            contig_chunks = {}
            gff_features = []
            current_contig = None
            fasta_section = False
            with open(gff_file, 'r') as file:
                for raw_line in file:
                    # Drop the line ending up front: otherwise it ends up in
                    # seq_id whenever 'ID=...' is the last attribute.
                    line = raw_line.rstrip('\r\n')
                    if line.startswith('##FASTA'):
                        fasta_section = True
                        continue
                    if fasta_section:
                        if line.startswith('>'):
                            current_contig = line[1:].split()[0]
                            contig_chunks[current_contig] = []
                        elif current_contig is not None:
                            # Sequence text before any header is ignored
                            # rather than raising NameError.
                            contig_chunks[current_contig].append(line.strip())
                        continue
                    line_data = line.split('\t')
                    if len(line_data) == 9 and line_data[2] == 'CDS':
                        contig = line_data[0]
                        start, end = int(line_data[3]), int(line_data[4])
                        strand = line_data[6]
                        seq_id = line_data[8].split('ID=')[1].split(';')[0]
                        gff_features.append((contig, start, end, strand, seq_id))

            # Join each contig once instead of once per feature.
            contig_sequences = {name: ''.join(chunks) for name, chunks in contig_chunks.items()}

            for contig, start, end, strand, seq_id in gff_features:
                full_sequence = contig_sequences.get(contig)
                if full_sequence is None:
                    continue
                cds_sequence = full_sequence[start - 1:end]
                if strand == '-':
                    # Coordinates are on the forward strand; a minus-strand
                    # CDS must be reverse-complemented to read 5'->3'.
                    cds_sequence = cds_sequence.translate(_CDS_COMPLEMENT)[::-1]
                wrapped_sequence = '\n'.join(
                    cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60))
                combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def run_cd_hit(input_file, clustering_output, options, threads=20):
    """Cluster a nucleotide FASTA file with ``cd-hit-est``.

    Args:
        input_file: FASTA file to cluster (passed as ``-i``).
        clustering_output: Output path (passed as ``-o``).
        options: Object exposing ``pident`` (passed as ``-c``) and
            ``len_diff`` (passed as ``-s``).
        threads: Value passed as ``-T``; defaults to the previously
            hard-coded 20.

    Raises:
        FileNotFoundError: If ``cd-hit-est`` is not on PATH.
        subprocess.CalledProcessError: If ``cd-hit-est`` exits with a
            non-zero status. Previously the failure was ignored and only
            surfaced later, when its output was read.
    """
    cdhit_command = [
        'cd-hit-est',
        '-i', input_file,
        '-o', clustering_output,
        '-c', str(options.pident),
        '-s', str(options.len_diff),
        '-T', str(threads),
        '-d', "0",
        '-sc', "1",
        '-sf', "1",
    ]
    subprocess.run(cdhit_command, check=True)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main():
    """Run the end-to-end pipeline: collect sequences, run CD-HIT, then cluster().

    Steps:
        1. Parse the command line.
        2. Write every CDS found in ``-id`` to
           ``<-od>/end_to_end_combined_sequences.fasta``.
        3. Run ``cd-hit-est`` on that file; output goes to
           ``<-od>/clustering_<-ct>``.
        4. Call ``cluster()`` on the resulting ``.clstr`` file.

    NOTE(review): ``-co`` and the trailing ``pyamilyseq_args`` are parsed but
    never used, and ``-ct`` only affects the output filename - CD-HIT is
    always the tool that is run. Left unchanged; confirm the intended
    behaviour.
    """
    parser = argparse.ArgumentParser(
        description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
    required = parser.add_argument_group('Required Arguments')
    required.add_argument("-id", action="store", dest="input_dir",
                          help="Directory containing GFF/FASTA files.",
                          required=True)
    required.add_argument("-od", action="store", dest="output_dir",
                          help="Directory for all output files.",
                          required=True)
    required.add_argument("-it", action="store", dest="input_type", choices=['separate', 'combined'],
                          help="Type of input files: 'separate' for separate FASTA and GFF files,"
                               " 'combined' for GFF files with embedded FASTA sequences.",
                          required=True)
    required.add_argument("-ns", action="store", dest="name_split",
                          help="Character used to split the filename and extract the genome name.",
                          required=True)
    required.add_argument("-pid", action="store", dest="pident", type=float,
                          help="Pident threshold for CD-HIT clustering.",
                          required=True)
    required.add_argument("-ld", action="store", dest="len_diff", type=float,
                          help="Length difference (-s) threshold for CD-HIT clustering.",
                          required=True)
    required.add_argument("-co", action="store", dest="clustering_out",
                          help="Output file for initial clustering.",
                          required=True)
    required.add_argument("-ct", action="store", dest="clustering_type", choices=['CD-HIT', 'BLAST', 'DIAMOND', "MMseqs2"],
                          help="Clustering format for PyamilySeq.",
                          required=True)

    output_args = parser.add_argument_group('Output Parameters')
    output_args.add_argument('-w', action="store", dest='write_families', default=None,
                             help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
                                  ' - Must provide FASTA file with -fasta',
                             required=False)
    output_args.add_argument('-con', action="store", dest='con_core', default=None,
                             help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
                                  ' - Must provide FASTA file with -fasta',
                             required=False)
    output_args.add_argument('-fasta', action='store', dest='fasta',
                             help='FASTA file to use in conjunction with "-w" or "-con"',
                             required=False)

    optional = parser.add_argument_group('Optional Arguments')
    optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
                          required=False)
    optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
                          required=False)
    optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
                          help='Default - (\'99,95,15\'): Gene family groups to use')
    optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
                          required=False)

    parser.add_argument("pyamilyseq_args", nargs=argparse.REMAINDER, help="Additional arguments for PyamilySeq.")
    options = parser.parse_args()

    output_path = os.path.abspath(options.output_dir)
    # The combined FASTA is opened for writing straight away, so the
    # directory has to exist first.
    os.makedirs(output_path, exist_ok=True)
    combined_out_file = os.path.join(output_path, "end_to_end_combined_sequences.fasta")
    clustering_output = os.path.join(output_path, 'clustering_' + options.clustering_type)

    # Step 1: Read and rename sequences from files based on input type
    if options.input_type == 'separate':
        read_separate_files(options.input_dir, options.name_split, combined_out_file)
    else:
        read_combined_files(options.input_dir, options.name_split, combined_out_file)

    # Step 2: Run CD-HIT on the renamed sequences
    run_cd_hit(combined_out_file, clustering_output, options)

    # Honour -groups instead of discarding it. The trailing 0 group that used
    # to be hard-coded ('99,95,15,0') is still appended when absent, so the
    # default run reports the same groups as before.
    core_groups = options.core_groups
    if '0' not in [group.strip() for group in core_groups.split(',')]:
        core_groups += ',0'

    clustering_options = argparse.Namespace(
        format='CD-HIT',
        reclustered=options.reclustered,
        # Honour -st; fall back to the historical default when not given.
        sequence_tag=options.sequence_tag or 'StORF',
        core_groups=core_groups,
        clusters=clustering_output + '.clstr',
        gene_presence_absence_out=options.gene_presence_absence_out,
        write_families=options.write_families,
        con_core=options.con_core,
        # cluster() reads options.fasta when -w / -con are used; it was
        # missing from this object.
        fasta=options.fasta,
    )

    # Step 3: Run PyamilySeq with the CD-HIT output
    cluster(clustering_options)
|
|
182
|
+
#run_pyamilyseq(options.clustering_out, options.clustering_type, combined_out_file, options.pyamilyseq_args)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
if __name__ == "__main__":
|
|
186
|
+
main()
|
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -6,6 +6,9 @@ import math
|
|
|
6
6
|
import sys
|
|
7
7
|
import argparse
|
|
8
8
|
import os
|
|
9
|
+
from tempfile import NamedTemporaryFile
|
|
10
|
+
|
|
11
|
+
|
|
9
12
|
|
|
10
13
|
try:
|
|
11
14
|
from .Constants import *
|
|
@@ -20,6 +23,75 @@ def sort_keys_by_values(dict1, dict2):
|
|
|
20
23
|
sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
|
|
21
24
|
return sorted_keys
|
|
22
25
|
|
|
26
|
+
def select_longest_gene(sequences):
    """Keep, for every genome, only its longest sequence.

    The genome is the part of the sequence ID before the first ``|``. When
    two sequences of a genome have the same length, the one seen first wins.

    Args:
        sequences: Mapping of sequence ID to sequence.

    Returns:
        Dict mapping genome name to a ``(sequence_id, sequence)`` tuple.
    """
    best_per_genome = {}
    for header in sequences:
        candidate = sequences[header]
        genome_key = header.partition('|')[0]
        current = best_per_genome.get(genome_key)
        # Strict comparison: an equally long later sequence never replaces
        # the one already stored.
        if current is None or len(current[1]) < len(candidate):
            best_per_genome[genome_key] = (header, candidate)
    return best_per_genome
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def run_mafft_on_sequences(sequences, output_file):
    """Run mafft on the given sequences and write to output file.

    The sequences are written to a temporary FASTA file, ``mafft --auto`` is
    run on it with stdout redirected to ``output_file`` and stderr discarded,
    and the temporary file is always removed afterwards.

    Args:
        sequences: Mapping of FASTA header (without ``>``) to sequence.
        output_file: Path the alignment is written to (overwritten).

    Raises:
        FileNotFoundError: If ``mafft`` is not on PATH.
        subprocess.CalledProcessError: If ``mafft`` exits with a non-zero
            status.
    """
    # delete=False because mafft reopens the file by name after it is closed.
    temp_input_file = NamedTemporaryFile('w', delete=False)
    temp_input_file_path = temp_input_file.name
    try:
        # Writing happens inside the try block so the temporary file is also
        # removed when the write itself fails.
        with temp_input_file:
            temp_input_file.writelines(
                f">{header}\n{sequence}\n" for header, sequence in sequences.items())

        with open(output_file, 'w') as output_f:
            subprocess.run(
                ['mafft', '--auto', temp_input_file_path],
                stdout=output_f,
                stderr=subprocess.DEVNULL,  # Suppress stderr
                check=True
            )
    finally:
        os.remove(temp_input_file_path)  # Clean up the temporary file
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def process_gene_families(directory, output_file):
    """Align every gene family and write one concatenated alignment per genome.

    For each ``.fasta`` file in ``directory`` the longest sequence per genome
    is selected and aligned with mafft. The per-family alignments are then
    concatenated per genome and written next to ``directory`` (in its parent
    directory) under the name ``output_file``.

    A genome that is missing from a family is padded with gaps (``-``) for
    that family's alignment width, so every concatenated sequence has the
    same length and the columns stay aligned.

    Args:
        directory: Directory holding one FASTA file per gene family.
        output_file: File name of the concatenated alignment.
    """
    # Build the output path from the parent directory instead of
    # str.replace(): the replace also rewrote other occurrences of
    # 'Gene_Families_Output' in the path and returned the directory itself
    # when the name was absent.
    output_file = os.path.join(os.path.dirname(os.path.normpath(directory)), output_file)

    # One {genome: aligned_sequence} dict per gene family, in processing order.
    family_alignments = []

    # sorted() keeps the concatenation order reproducible.
    for gene_file in sorted(os.listdir(directory)):
        if not gene_file.endswith('.fasta'):
            continue
        gene_path = os.path.join(directory, gene_file)

        # Read sequences from the gene family file
        sequences = read_fasta(gene_path)

        # Select the longest sequence for each genome
        longest_sequences = select_longest_gene(sequences)
        representatives = {seq_id: seq for seq_id, seq in longest_sequences.values()}

        # Use a unique temporary path (not the current working directory) and
        # remove it even when mafft or the parsing fails.
        aligned_handle = NamedTemporaryFile('w', suffix='_aligned.fasta', delete=False)
        aligned_file = aligned_handle.name
        aligned_handle.close()
        try:
            run_mafft_on_sequences(representatives, aligned_file)
            # NOTE(review): assumes read_fasta returns a mapping of header to
            # sequence, as its use with .items() implies - confirm.
            aligned_sequences = read_fasta(aligned_file)
            family = {}
            for header, aligned_seq in aligned_sequences.items():
                family[header.split('|')[0]] = aligned_seq
        finally:
            os.remove(aligned_file)

        if family:
            family_alignments.append(family)

    # Genomes in first-seen order across all families.
    genomes = []
    seen = set()
    for family in family_alignments:
        for genome in family:
            if genome not in seen:
                seen.add(genome)
                genomes.append(genome)

    # Write the concatenated sequences to the output file
    with open(output_file, 'w') as out:
        for genome in genomes:
            parts = []
            for family in family_alignments:
                width = len(next(iter(family.values())))
                parts.append(family.get(genome, '-' * width))
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(''.join(parts), 60)
            out.write(f"{wrapped_sequence}\n")
|
|
94
|
+
|
|
23
95
|
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
24
96
|
print("Outputting gene_presence_absence file")
|
|
25
97
|
in_name = options.clusters.split('.')[0]
|
|
@@ -92,7 +164,11 @@ def get_cores(options,genome_dict):
|
|
|
92
164
|
for group in options.core_groups.split(','):
|
|
93
165
|
calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
|
|
94
166
|
if first == False:
|
|
95
|
-
|
|
167
|
+
# Ensure no overlap
|
|
168
|
+
# if calculated_floor <= prev_top:
|
|
169
|
+
# calculated_floor = prev_top - 1
|
|
170
|
+
|
|
171
|
+
groups[group] = (calculated_floor,prev_top)
|
|
96
172
|
else:
|
|
97
173
|
groups[group] = (calculated_floor, prev_top)
|
|
98
174
|
first = False
|
|
@@ -209,28 +285,28 @@ def combined_clustering_counting(options, pangenome_clusters_First, reps, combin
|
|
|
209
285
|
|
|
210
286
|
#@profile
|
|
211
287
|
def single_clustering_counting(options, pangenome_clusters_First, reps):
    """Summarise every first-round cluster when there is a single clustering round.

    Each cluster is mapped to ``[1, n, [n], 0, 0, 0]`` where ``n`` is the
    number of entries stored for that cluster in ``pangenome_clusters_First``.
    NOTE(review): the three trailing zeros presumably stand for second-round
    counts that do not exist in this mode - confirm against the callers.

    The previous implementation encoded ``n`` into a ``"<rep>_<n>"`` string
    and parsed it back, kept an unused ``recorded_First`` list, and carried an
    ``except KeyError`` branch that could not trigger because the lookup was
    on a ``defaultdict``; both branches produced the value above.

    Args:
        options: Unused; kept for interface compatibility.
        pangenome_clusters_First: Mapping of cluster id to the collection of
            genomes in that cluster.
        reps: Mapping whose key order provides the cluster representatives.

    Returns:
        A mapping of the same type and key order as
        ``pangenome_clusters_First`` with the summary list per cluster.

    Raises:
        IndexError: If a cluster id does not index into ``reps`` (unchanged).
        ValueError: If a cluster id cannot be converted to ``int`` (unchanged).
    """
    # A shallow copy keeps the mapping type and key order; every value is
    # replaced below, so deep-copying the genome collections was wasted work.
    pangenome_clusters_Type = copy.copy(pangenome_clusters_First)
    list_of_reps = list(reps.keys())
    for cluster, First_genomes in pangenome_clusters_First.items():
        # Kept purely as the sanity check it always was: every cluster id
        # must map onto a representative.
        _ = list_of_reps[int(cluster)]
        num_First_genomes = len(First_genomes)
        pangenome_clusters_Type[cluster] = [1, num_First_genomes, [num_First_genomes], 0, 0, 0]

    return pangenome_clusters_Type
|
|
236
312
|
|
|
@@ -493,7 +569,7 @@ def cluster(options):
|
|
|
493
569
|
pangenome_clusters_Type = single_clustering_counting(options, pangenome_clusters_First, reps)
|
|
494
570
|
|
|
495
571
|
|
|
496
|
-
|
|
572
|
+
|
|
497
573
|
Number_Of_StORF_Extending_But_Same_Genomes = 0
|
|
498
574
|
|
|
499
575
|
sorted_first_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
|
|
@@ -504,12 +580,12 @@ def cluster(options):
|
|
|
504
580
|
print("Calculating Groups")
|
|
505
581
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
506
582
|
############################### Calculate First only
|
|
507
|
-
if numbers[0] == 1 and numbers[1] >=2:
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
elif numbers[0] >1 and numbers[1] >=2:
|
|
511
|
-
|
|
512
|
-
|
|
583
|
+
#if numbers[0] == 1 and numbers[1] >=2:
|
|
584
|
+
calc_First_only_core(cluster, numbers[1],groups,cores)
|
|
585
|
+
|
|
586
|
+
# elif numbers[0] >1 and numbers[1] >=2:
|
|
587
|
+
# calc_First_only_core(cluster, numbers[2][0],groups,cores)
|
|
588
|
+
|
|
513
589
|
|
|
514
590
|
if options.reclustered != None:
|
|
515
591
|
############################# Calculate First and Reclustered-Second
|
|
@@ -532,13 +608,13 @@ def cluster(options):
|
|
|
532
608
|
if data[1] >= 2:
|
|
533
609
|
calc_only_Second_only_core(groups, cores, data[1])
|
|
534
610
|
###########################
|
|
535
|
-
print("End")
|
|
536
611
|
key_order = ['first_core_', 'extended_core_', 'combined_core_', 'second_core_','only_second_core_']
|
|
537
|
-
print("Gene
|
|
612
|
+
print("Gene Groups:")
|
|
538
613
|
for key_prefix in key_order:
|
|
539
614
|
for key, value in cores.items():
|
|
540
615
|
if key.startswith(key_prefix):
|
|
541
616
|
print(f"{key}: {len(value)}")
|
|
617
|
+
print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
|
|
542
618
|
|
|
543
619
|
if options.gene_presence_absence_out != None:
|
|
544
620
|
gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
@@ -566,25 +642,107 @@ def cluster(options):
|
|
|
566
642
|
wrapped_sequence = wrap_sequence(sequences[header])
|
|
567
643
|
outfile.write(f"{wrapped_sequence}\n")
|
|
568
644
|
|
|
645
|
+
if options.con_core != None and options.fasta != None and options.write_families != None:
|
|
646
|
+
process_gene_families(os.path.join(input_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
# groups_dir = os.path.join(input_dir, 'Gene_Families_Output')
|
|
650
|
+
# """Run mafft on all .fasta files in the given directory."""
|
|
651
|
+
# for filename in os.listdir(groups_dir):
|
|
652
|
+
# if filename.endswith('.fasta'):
|
|
653
|
+
# input_path = os.path.join(groups_dir, filename)
|
|
654
|
+
# output_filename = filename.replace('.fasta', '_mafft.aln')
|
|
655
|
+
# output_path = os.path.join(groups_dir, output_filename)
|
|
656
|
+
#
|
|
657
|
+
# # Call mafft command
|
|
658
|
+
# try:
|
|
659
|
+
# with open(output_path, 'w') as output_file:
|
|
660
|
+
# subprocess.run(
|
|
661
|
+
# ['mafft', '--auto', input_path],
|
|
662
|
+
# stdout=output_file,
|
|
663
|
+
# stderr=subprocess.DEVNULL, # Suppress stderr
|
|
664
|
+
# check=True
|
|
665
|
+
# )
|
|
666
|
+
# print(f"Processed {input_path} -> {output_path}")
|
|
667
|
+
# except subprocess.CalledProcessError as e:
|
|
668
|
+
# print(f"Failed to process {input_path}: {e}")
|
|
669
|
+
|
|
670
|
+
##This could be run once and not above AND here..
|
|
671
|
+
# output_dir = os.path.dirname(os.path.abspath(options.clusters))
|
|
672
|
+
# sequences = read_fasta(options.fasta)
|
|
673
|
+
# concatenated_sequences = {genome: '' for genome in genome_dict.keys()}
|
|
674
|
+
#
|
|
675
|
+
#
|
|
676
|
+
# for key_prefix in key_order:
|
|
677
|
+
# for key, values in cores.items():
|
|
678
|
+
# if any(part in options.con_core.split(',') for part in key.split('_')):
|
|
679
|
+
# if key.startswith(key_prefix):
|
|
680
|
+
# for value in values:
|
|
681
|
+
# length_capture = {genome: [] for genome in genome_dict.keys()}
|
|
682
|
+
# sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
|
|
683
|
+
# for header in sequences_to_write:
|
|
684
|
+
# if header in sequences:
|
|
685
|
+
# length_capture[header.split('|')[0]].append([header,len(sequences[header])])
|
|
686
|
+
# if all(bool(values) for values in length_capture.values()): # If a GF is not present in 'ALL' genomes, do not add to concat
|
|
687
|
+
# for genome, lengths in length_capture.items():
|
|
688
|
+
# max_value = float('-inf')
|
|
689
|
+
# max_item = None
|
|
690
|
+
# for length in lengths:
|
|
691
|
+
# current_value = length[1]
|
|
692
|
+
# if current_value > max_value:
|
|
693
|
+
# max_value = current_value
|
|
694
|
+
# max_item = length[0]
|
|
695
|
+
# concatenated_sequences[genome.split('|')[0]] += sequences[max_item]
|
|
696
|
+
#
|
|
697
|
+
#
|
|
698
|
+
# with open(os.path.join(output_dir, 'core_concat.fasta'), 'w') as outfile:
|
|
699
|
+
# for genome, sequence in concatenated_sequences.items():
|
|
700
|
+
# outfile.write(f">{genome}\n")
|
|
701
|
+
# wrapped_sequence = wrap_sequence(sequence)
|
|
702
|
+
# outfile.write(f"{wrapped_sequence}\n")
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
# for core_gene_family in core_gene_families:
|
|
706
|
+
# found_sequences = {genome: False for genome in genomes}
|
|
707
|
+
#
|
|
708
|
+
# for fasta_file in fasta_files:
|
|
709
|
+
# sequences = read_fasta(fasta_file)
|
|
710
|
+
# for header, sequence in sequences.items():
|
|
711
|
+
# genome = header.split('|')[0]
|
|
712
|
+
# if genome in genomes and core_gene_family in header:
|
|
713
|
+
# concatenated_sequences[genome] += sequence
|
|
714
|
+
# found_sequences[genome] = True
|
|
715
|
+
#
|
|
716
|
+
# for genome in genomes:
|
|
717
|
+
# if not found_sequences[genome]:
|
|
718
|
+
# concatenated_sequences[genome] += '-' * len(next(iter(sequences.values())))
|
|
719
|
+
|
|
720
|
+
|
|
569
721
|
|
|
570
722
|
|
|
571
723
|
def main():
|
|
572
724
|
|
|
573
|
-
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
|
|
725
|
+
parser = argparse.ArgumentParser(description='PyamilySeq-Species ' + PyamilySeq_Version + ': PyamilySeq-Species Run Parameters.')
|
|
574
726
|
parser._action_groups.pop()
|
|
575
727
|
|
|
576
728
|
required = parser.add_argument_group('Required Arguments')
|
|
577
729
|
required.add_argument('-c', action='store', dest='clusters', help='Clustering output file from CD-HIT, TSV or CSV Edge List',
|
|
578
730
|
required=True)
|
|
579
731
|
required.add_argument('-f', action='store', dest='format', choices=['CD-HIT', 'CSV', 'TSV'],
|
|
580
|
-
help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))',
|
|
732
|
+
help='Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))',
|
|
733
|
+
required=True)
|
|
581
734
|
|
|
582
735
|
output_args = parser.add_argument_group('Output Parameters')
|
|
583
|
-
output_args.add_argument('-w', action="store", dest='write_families', default=
|
|
584
|
-
help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99
|
|
585
|
-
' - Must provide FASTA file with -fasta'
|
|
736
|
+
output_args.add_argument('-w', action="store", dest='write_families', default=None,
|
|
737
|
+
help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
|
|
738
|
+
' - Must provide FASTA file with -fasta',
|
|
739
|
+
required=False)
|
|
740
|
+
output_args.add_argument('-con', action="store", dest='con_core', default=None,
|
|
741
|
+
help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
|
|
742
|
+
' - Must provide FASTA file with -fasta',
|
|
743
|
+
required=False)
|
|
586
744
|
output_args.add_argument('-fasta', action='store', dest='fasta',
|
|
587
|
-
help='FASTA file to use in conjunction with "-w"',
|
|
745
|
+
help='FASTA file to use in conjunction with "-w" or "-con"',
|
|
588
746
|
required=False)
|
|
589
747
|
|
|
590
748
|
optional = parser.add_argument_group('Optional Arguments')
|
|
@@ -592,16 +750,18 @@ def main():
|
|
|
592
750
|
required=False)
|
|
593
751
|
optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
|
|
594
752
|
required=False)
|
|
595
|
-
optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,
|
|
596
|
-
help='Default - (\'99,95,
|
|
753
|
+
optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
|
|
754
|
+
help='Default - (\'99,95,15\'): Gene family groups to use')
|
|
597
755
|
optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
|
|
598
756
|
required=False)
|
|
599
757
|
|
|
600
758
|
misc = parser.add_argument_group('Misc')
|
|
601
759
|
misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
|
|
602
|
-
help='Default - False: Print out runtime messages'
|
|
760
|
+
help='Default - False: Print out runtime messages',
|
|
761
|
+
required = False)
|
|
603
762
|
misc.add_argument('-v', action='store_true', dest='version',
|
|
604
|
-
help='Default - False: Print out version number and exit'
|
|
763
|
+
help='Default - False: Print out version number and exit',
|
|
764
|
+
required=False)
|
|
605
765
|
|
|
606
766
|
|
|
607
767
|
options = parser.parse_args()
|
|
@@ -614,6 +774,11 @@ def main():
|
|
|
614
774
|
if options.sequence_tag == None:
|
|
615
775
|
options.sequence_tag = 'StORF'
|
|
616
776
|
|
|
777
|
+
if options.con_core == True:
|
|
778
|
+
if is_tool_installed('mafft'):
|
|
779
|
+
print("mafft is installed. Proceeding with alignment.")
|
|
780
|
+
else:
|
|
781
|
+
print("mafft is not installed. Please install mafft to proceed.")
|
|
617
782
|
|
|
618
783
|
if options.write_families != None and options.fasta == False:
|
|
619
784
|
exit("-fasta must br provided if -w is used")
|
|
@@ -643,5 +808,5 @@ def main():
|
|
|
643
808
|
|
|
644
809
|
if __name__ == "__main__":
|
|
645
810
|
main()
|
|
646
|
-
print("
|
|
811
|
+
print("Done")
|
|
647
812
|
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: PyamilySeq
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
|
+
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
|
+
Author: Nicholas Dimonaco
|
|
7
|
+
Author-email: nicholas@dimonaco.co.uk
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/NickJD/PyamilySeq/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.6
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
|
|
16
|
+
# PyamilySeq - !BETA!
|
|
17
|
+
**PyamilySeq** (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, BLAST, DIAMOND or MMseqs2.
|
|
18
|
+
This work is an extension of the gene family / pangenome tool developed for the StORF-Reporter publication in NAR (https://doi.org/10.1093/nar/gkad814).
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
- **End-to-End**: PyamilySeq can take a directory of GFF+FASTA files, run CD-HIT for clustering and process the results.
|
|
22
|
+
- **Clustering**: Supports input from CD-HIT formatted files as well as CSV and TSV edge lists (-outfmt 6 from BLAST/DIAMOND).
|
|
23
|
+
- **Reclustering**: Allows for the addition of new sequences post-initial clustering.
|
|
24
|
+
- **Output**: Generates a 'Roary/Panaroo'-formatted gene presence-absence CSV file for downstream analysis.
|
|
25
|
+
- Align representative sequences using MAFFT.
|
|
26
|
+
- Output concatenated aligned sequences for downstream analysis.
|
|
27
|
+
- Optionally output sequences of identified families.
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
### Installation
|
|
31
|
+
PyamilySeq requires Python 3.6 or higher. Install using pip:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install PyamilySeq
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage - Menu
|
|
38
|
+
```
|
|
39
|
+
usage: PyamilySeq.py [-h] -id INPUT_DIR -od OUTPUT_DIR -it {separate,combined} -ns NAME_SPLIT -pid PIDENT -ld LEN_DIFF -co CLUSTERING_OUT -ct {CD-HIT,BLAST,DIAMOND,MMseqs2} [-w WRITE_FAMILIES] [-con CON_CORE] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG] [-groups CORE_GROUPS]
|
|
40
|
+
[-gpa GENE_PRESENCE_ABSENCE_OUT]
|
|
41
|
+
...
|
|
42
|
+
|
|
43
|
+
PyamilySeq v0.4.0: PyamilySeq Run Parameters.
|
|
44
|
+
|
|
45
|
+
positional arguments:
|
|
46
|
+
pyamilyseq_args Additional arguments for PyamilySeq.
|
|
47
|
+
|
|
48
|
+
options:
|
|
49
|
+
-h, --help show this help message and exit
|
|
50
|
+
|
|
51
|
+
Required Arguments:
|
|
52
|
+
-id INPUT_DIR Directory containing GFF/FASTA files.
|
|
53
|
+
-od OUTPUT_DIR Directory for all output files.
|
|
54
|
+
-it {separate,combined}
|
|
55
|
+
Type of input files: 'separate' for separate FASTA and GFF files, 'combined' for GFF files with embedded FASTA sequences.
|
|
56
|
+
-ns NAME_SPLIT Character used to split the filename and extract the genome name.
|
|
57
|
+
-pid PIDENT Pident threshold for CD-HIT clustering.
|
|
58
|
+
-ld LEN_DIFF Length difference (-s) threshold for CD-HIT clustering.
|
|
59
|
+
-co CLUSTERING_OUT Output file for initial clustering.
|
|
60
|
+
-ct {CD-HIT,BLAST,DIAMOND,MMseqs2}
|
|
61
|
+
Clustering format for PyamilySeq.
|
|
62
|
+
|
|
63
|
+
Output Parameters:
|
|
64
|
+
-w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95" - Must provide FASTA file with -fasta
|
|
65
|
+
-con CON_CORE Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95" - Must provide FASTA file with -fasta
|
|
66
|
+
-fasta FASTA FASTA file to use in conjunction with "-w" or "-con"
|
|
67
|
+
|
|
68
|
+
Optional Arguments:
|
|
69
|
+
-rc RECLUSTERED Clustering output file from secondary round of clustering
|
|
70
|
+
-st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
|
|
71
|
+
-groups CORE_GROUPS Default - ('99,95,15'): Gene family groups to use
|
|
72
|
+
-gpa GENE_PRESENCE_ABSENCE_OUT
|
|
73
|
+
Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Example Run End-to-End - 'genomes' is a test-directory containing GFF files with ##FASTA at the bottom
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
PyamilySeq -id .../genomes -it combined -ns _combined.gff3 -pid 0.90 -ld 0.60 -co testing_cd-hit -ct CD-HIT -od .../testing
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
Calculating Groups
|
|
84
|
+
Gene Groups:
|
|
85
|
+
first_core_99: 3103
|
|
86
|
+
first_core_95: 0
|
|
87
|
+
first_core_15: 3217
|
|
88
|
+
first_core_0: 4808
|
|
89
|
+
Total Number of Gene Groups (Including Singletons): 11128
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
+
PyamilySeq/Constants.py,sha256=971sO5fjptv27yRtg595ex8VuNURb2Nh4mFSdGx6HJ4,399
|
|
3
|
+
PyamilySeq/PyamilySeq.py,sha256=Zy84pSBXY9EnMmk30SrfbQr9-SWYJ4rPHb9xbV3L9lU,8971
|
|
4
|
+
PyamilySeq/PyamilySeq_Species.py,sha256=kTXeCgplHfCglii_g099zdt2iy0lc5wDX3k4HuSaIgo,39167
|
|
5
|
+
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
7
|
+
PyamilySeq-0.4.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
8
|
+
PyamilySeq-0.4.0.dist-info/METADATA,sha256=d0goQEGZZz_q6_sZUwoPr-h7FR-Ad7WmupIJuK8MTFc,4462
|
|
9
|
+
PyamilySeq-0.4.0.dist-info/WHEEL,sha256=rWxmBtp7hEUqVLOnTaDOPpR-cZpCDkzhhcBce-Zyd5k,91
|
|
10
|
+
PyamilySeq-0.4.0.dist-info/entry_points.txt,sha256=aEpNchWXaSR7_hGQqXYGtvXz14FgIcfFdXESpEhsvXg,58
|
|
11
|
+
PyamilySeq-0.4.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
12
|
+
PyamilySeq-0.4.0.dist-info/RECORD,,
|
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: PyamilySeq
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary: PyamilySeq - A tool to look for sequence-based gene families identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
|
-
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
|
-
Author: Nicholas Dimonaco
|
|
7
|
-
Author-email: nicholas@dimonaco.co.uk
|
|
8
|
-
Project-URL: Bug Tracker, https://github.com/NickJD/PyamilySeq/issues
|
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
|
10
|
-
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Requires-Python: >=3.6
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
License-File: LICENSE
|
|
15
|
-
|
|
16
|
-
# PyamilySeq
|
|
17
|
-
PyamilySeq (Family Seek) is a Python tool for clustering gene sequences into families based on sequence similarity identified by tools such as CD-HIT, DIAMOND or MMseqs2.
|
|
18
|
-
This work is an extension of the gene family / pangenome tool developed for the StORF-Reporter publication in NAR (https://doi.org/10.1093/nar/gkad814).
|
|
19
|
-
|
|
20
|
-
## Features
|
|
21
|
-
|
|
22
|
-
- **Clustering**: Supports input from CD-HIT formatted files as well as TSV and CSV Edge List formats.
|
|
23
|
-
- **Reclustering**: Allows for the addition of new sequences post-initial clustering.
|
|
24
|
-
- **Output**: Generates a gene 'Roary' presence-absence CSV formatted file for downstream analysis.
|
|
25
|
-
|
|
26
|
-
## Installation
|
|
27
|
-
|
|
28
|
-
PyamilySeq requires Python 3.6 or higher. Install dependencies using pip:
|
|
29
|
-
|
|
30
|
-
```bash
|
|
31
|
-
pip install PyamilySeq
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
## Usage - Menu
|
|
35
|
-
```
|
|
36
|
-
PyamilySeq_Species.py -h
|
|
37
|
-
usage: PyamilySeq_Species.py [-h] -c CLUSTERS -f {CD-HIT,CSV,TSV} [-w WRITE_FAMILIES] [-fasta FASTA] [-rc RECLUSTERED] [-st SEQUENCE_TAG]
|
|
38
|
-
[-groups CORE_GROUPS] [-gpa GENE_PRESENCE_ABSENCE_OUT] [-verbose {True,False}] [-v]
|
|
39
|
-
|
|
40
|
-
PyamilySeq v0.2.0: PyamilySeq Run Parameters.
|
|
41
|
-
|
|
42
|
-
Required Arguments:
|
|
43
|
-
-c CLUSTERS Clustering output file from CD-HIT, TSV or CSV Edge List
|
|
44
|
-
-f {CD-HIT,CSV,TSV} Which format to use (CD-HIT or Comma/Tab Separated Edge-List (such as MMseqs2 tsv output))
|
|
45
|
-
|
|
46
|
-
Output Parameters:
|
|
47
|
-
-w WRITE_FAMILIES Default - No output: Output sequences of identified families (provide levels at which to output "-w 99 95" - Must provide
|
|
48
|
-
FASTA file with -fasta
|
|
49
|
-
-fasta FASTA FASTA file to use in conjunction with "-w"
|
|
50
|
-
|
|
51
|
-
Optional Arguments:
|
|
52
|
-
-rc RECLUSTERED Clustering output file from secondary round of clustering
|
|
53
|
-
-st SEQUENCE_TAG Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences
|
|
54
|
-
-groups CORE_GROUPS Default - ('99,95,90,80,15'): Gene family groups to use
|
|
55
|
-
-gpa GENE_PRESENCE_ABSENCE_OUT
|
|
56
|
-
Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other
|
|
57
|
-
downstream tools
|
|
58
|
-
|
|
59
|
-
Misc:
|
|
60
|
-
-verbose {True,False}
|
|
61
|
-
Default - False: Print out runtime messages
|
|
62
|
-
-v Default - False: Print out version number and exit
|
|
63
|
-
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
### Clustering Analysis
|
|
67
|
-
|
|
68
|
-
To perform clustering analysis:
|
|
69
|
-
|
|
70
|
-
```bash
|
|
71
|
-
python pyamilyseq.py -c clusters_file -f format
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
Replace `clusters_file` with the path to your clustering output file and `format` with one of: `CD-HIT`, `CSV`, or `TSV`.
|
|
75
|
-
|
|
76
|
-
### Reclustering
|
|
77
|
-
|
|
78
|
-
To add new sequences and recluster:
|
|
79
|
-
|
|
80
|
-
```bash
|
|
81
|
-
PyamilySeq -c clusters_file -f format --reclustered reclustered_file
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
Replace `reclustered_file` with the path to the file containing additional sequences.
|
|
85
|
-
|
|
86
|
-
## Output
|
|
87
|
-
|
|
88
|
-
PyamilySeq generates various outputs, including:
|
|
89
|
-
|
|
90
|
-
- **Gene Presence-Absence File**: This CSV file details the presence and absence of genes across genomes.
|
|
91
|
-
- **FASTA Files for Each Gene Family**:
|
|
92
|
-
|
|
93
|
-
## Gene Family Groups
|
|
94
|
-
|
|
95
|
-
After analysis, PyamilySeq categorizes gene families into several groups:
|
|
96
|
-
|
|
97
|
-
- **First Core**: Gene families present in all analysed genomes initially.
|
|
98
|
-
- **Extended Core**: Gene families extended with additional sequences.
|
|
99
|
-
- **Combined Core**: Gene families combined with both initial and additional sequences.
|
|
100
|
-
- **Second Core**: Gene families identified only in the additional sequences.
|
|
101
|
-
- **Only Second Core**: Gene families exclusively found in the additional sequences.
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
PyamilySeq/CD-Hit_StORF-Reporter_Cross-Genera_Builder.py,sha256=UzQ5iOKCNfurxmj1pnkowF11YfWBO5vnBCKxQK6goB8,26538
|
|
2
|
-
PyamilySeq/Constants.py,sha256=3Nr6JfUVt2eZT4M7fV-sz_bPXIvPgxIBT5nR76kCPIo,30
|
|
3
|
-
PyamilySeq/PyamilySeq_Species.py,sha256=SCWeK7bEfnKLrfzliiOx7Jtmie8vvAXGtQE_PpJD5hY,31040
|
|
4
|
-
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
PyamilySeq/combine_FASTA_with_genome_IDs.py,sha256=aMUVSk6jKnKX0g04RMM360QueZS83lRLqLLysBtQbLo,2009
|
|
6
|
-
PyamilySeq-0.2.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
7
|
-
PyamilySeq-0.2.0.dist-info/METADATA,sha256=FUiZzxQzqnOwokb7MflZCMUzK9JgFVUVzEvLBPAlpgk,4144
|
|
8
|
-
PyamilySeq-0.2.0.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
9
|
-
PyamilySeq-0.2.0.dist-info/entry_points.txt,sha256=zGtA2Ycf0LG3PR7zuuT0wjaAKLFxtyGgBc0O_W7E250,66
|
|
10
|
-
PyamilySeq-0.2.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
11
|
-
PyamilySeq-0.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|