PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Summary.py +2 -2
- PyamilySeq/Group_Splitter.py +247 -58
- PyamilySeq/PyamilySeq.py +168 -148
- PyamilySeq/PyamilySeq_Genus.py +11 -11
- PyamilySeq/PyamilySeq_Species.py +51 -29
- PyamilySeq/Seq_Combiner.py +6 -7
- PyamilySeq/Seq_Extractor.py +64 -0
- PyamilySeq/Seq_Finder.py +56 -0
- PyamilySeq/clusterings.py +139 -49
- PyamilySeq/constants.py +2 -0
- PyamilySeq/utils.py +214 -56
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/METADATA +174 -138
- PyamilySeq-1.0.1.dist-info/RECORD +18 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/entry_points.txt +2 -0
- PyamilySeq/Constants.py +0 -2
- PyamilySeq-0.9.0.dist-info/RECORD +0 -16
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.9.0.dist-info → PyamilySeq-1.0.1.dist-info}/top_level.txt +0 -0
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -1,32 +1,27 @@
|
|
|
1
|
-
#from line_profiler_pycharm import profile
|
|
2
|
-
|
|
3
|
-
import math
|
|
4
1
|
|
|
5
2
|
try:
|
|
6
|
-
from .
|
|
3
|
+
from .constants import *
|
|
7
4
|
from .clusterings import *
|
|
8
5
|
from .utils import *
|
|
9
6
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
10
|
-
from
|
|
7
|
+
from constants import *
|
|
11
8
|
from clusterings import *
|
|
12
9
|
from utils import *
|
|
13
10
|
|
|
14
11
|
|
|
15
|
-
#def output_fasta(options, gene_families):
|
|
16
|
-
|
|
17
12
|
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
18
13
|
print("Outputting gene_presence_absence file")
|
|
19
14
|
output_dir = os.path.abspath(options.output_dir)
|
|
20
|
-
in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
21
|
-
gpa_outfile = os.path.join(output_dir,
|
|
22
|
-
gpa_outfile = open(gpa_outfile
|
|
15
|
+
#in_name = options.clusters.split('.')[0].split('/')[-1]
|
|
16
|
+
gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
|
|
17
|
+
gpa_outfile = open(gpa_outfile, 'w')
|
|
23
18
|
gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
|
|
24
19
|
'"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
|
|
25
20
|
gpa_outfile.write('","'.join(genome_dict.keys()))
|
|
26
21
|
gpa_outfile.write('"\n')
|
|
27
22
|
for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
|
|
28
23
|
average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
|
|
29
|
-
gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
24
|
+
gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
|
|
30
25
|
'","","","","","","","","",""')
|
|
31
26
|
|
|
32
27
|
|
|
@@ -35,9 +30,9 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
|
|
|
35
30
|
tmp_list = []
|
|
36
31
|
for value in sequences:
|
|
37
32
|
if value.split('|')[0] == genome:
|
|
38
|
-
tmp_list.append(value)
|
|
33
|
+
tmp_list.append(value.split('|')[1])
|
|
39
34
|
if tmp_list:
|
|
40
|
-
full_out += ',"'+''.join(tmp_list)+'"'
|
|
35
|
+
full_out += ',"'+'\t'.join(tmp_list)+'"'
|
|
41
36
|
else:
|
|
42
37
|
full_out = ',""'
|
|
43
38
|
gpa_outfile.write(full_out)
|
|
@@ -64,7 +59,7 @@ def get_cores(options,genome_dict):
|
|
|
64
59
|
cores = OrderedDict()
|
|
65
60
|
prev_top = len(genome_dict)
|
|
66
61
|
first = True
|
|
67
|
-
for group in options.
|
|
62
|
+
for group in options.species_groups.split(','):
|
|
68
63
|
calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
|
|
69
64
|
if first == False:
|
|
70
65
|
groups[group] = (calculated_floor,prev_top)
|
|
@@ -138,14 +133,16 @@ def cluster(options):
|
|
|
138
133
|
|
|
139
134
|
if options.cluster_format == 'CD-HIT':
|
|
140
135
|
genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
|
|
141
|
-
elif '
|
|
142
|
-
genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps =
|
|
136
|
+
elif 'BLAST' in options.cluster_format:
|
|
137
|
+
genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_BLAST(options, '|')
|
|
138
|
+
elif 'MMseqs' in options.cluster_format:
|
|
139
|
+
genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_MMseqs(options, '|')
|
|
143
140
|
|
|
144
141
|
###
|
|
145
142
|
cores, groups = get_cores(options, genome_dict)
|
|
146
143
|
###
|
|
147
144
|
|
|
148
|
-
if options.reclustered != None:
|
|
145
|
+
if options.reclustered != None: #FIX
|
|
149
146
|
if options.cluster_format == 'CD-HIT':
|
|
150
147
|
combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
|
|
151
148
|
elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
|
|
@@ -169,8 +166,6 @@ def cluster(options):
|
|
|
169
166
|
for cluster, numbers in pangenome_clusters_Type_sorted.items():
|
|
170
167
|
############################### Calculate First only
|
|
171
168
|
cluster = str(cluster)
|
|
172
|
-
if '78' in cluster:
|
|
173
|
-
pass
|
|
174
169
|
for grouping in numbers[2]: #!!# Could do with a more elegant solution
|
|
175
170
|
current_cluster = grouping[0].split(':')[0]
|
|
176
171
|
if current_cluster not in seen_groupings:
|
|
@@ -210,8 +205,10 @@ def cluster(options):
|
|
|
210
205
|
stats_out = os.path.join(output_path,'summary_statistics.txt')
|
|
211
206
|
key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
|
|
212
207
|
with open(stats_out, 'w') as outfile:
|
|
208
|
+
print("Number of Genomes: " + str(len(genome_dict)))
|
|
209
|
+
outfile.write("Number of Genomes: " + str(len(genome_dict)) + "\n")
|
|
213
210
|
print("Gene Groups:")
|
|
214
|
-
outfile.write("Gene Groups
|
|
211
|
+
outfile.write("Gene Groups\n")
|
|
215
212
|
for key_prefix in key_order:
|
|
216
213
|
for key, value in cores.items():
|
|
217
214
|
if key.startswith(key_prefix):
|
|
@@ -236,34 +233,59 @@ def cluster(options):
|
|
|
236
233
|
###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -wruite-groups not presented then it needs
|
|
237
234
|
# to be done for alignment full anyway...
|
|
238
235
|
|
|
236
|
+
genome_list = list(genome_dict.keys())
|
|
239
237
|
if options.run_mode == 'Full':
|
|
238
|
+
sequences = read_fasta(options.fasta)
|
|
240
239
|
if options.reclustered == None:
|
|
241
240
|
combined_pangenome_clusters_Second_sequences = None
|
|
241
|
+
## Output representative sequences
|
|
242
|
+
representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
|
|
243
|
+
with open(representatives_out, 'w') as outfile:
|
|
244
|
+
for cluster, ids in pangenome_clusters_First_sequences.items():
|
|
245
|
+
outfile.write('>group_'+str(cluster)+'\n')
|
|
246
|
+
wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
|
|
247
|
+
outfile.write(wrapped_aa_seq+'\n')
|
|
242
248
|
if options.write_groups != None:
|
|
243
249
|
print("Outputting gene group FASTA files")
|
|
244
|
-
sequences = read_fasta(options.fasta)
|
|
245
250
|
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
246
|
-
output_dir = os.path.join(options.output_dir, '
|
|
247
|
-
|
|
251
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
|
|
252
|
+
write_groups_func(options,output_dir, key_order, cores, sequences,
|
|
248
253
|
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
249
254
|
|
|
250
255
|
if options.align_core != None:
|
|
251
256
|
print("Processing gene group alignment")
|
|
252
|
-
|
|
257
|
+
process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')
|
|
253
258
|
|
|
254
259
|
elif options.run_mode == 'Partial':
|
|
260
|
+
sequences = read_fasta(options.fasta)
|
|
255
261
|
if options.reclustered == None:
|
|
256
262
|
combined_pangenome_clusters_Second_sequences = None
|
|
257
|
-
|
|
263
|
+
# else: ## Output representative sequences - Under development
|
|
264
|
+
# representatives_out = os.path.join(output_path, 'pan_genome_reference_reclustered.fa')
|
|
265
|
+
# with open(representatives_out, 'w') as outfile:
|
|
266
|
+
# for cluster, ids in combined_pangenome_clusters_Second_sequences.items():
|
|
267
|
+
# outfile.write('>group_' + str(cluster) + '\n')
|
|
268
|
+
# try:
|
|
269
|
+
# wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
|
|
270
|
+
# except:
|
|
271
|
+
# print(2)
|
|
272
|
+
# outfile.write(wrapped_aa_seq + '\n')
|
|
273
|
+
## Output representative sequences
|
|
274
|
+
representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
|
|
275
|
+
with open(representatives_out, 'w') as outfile:
|
|
276
|
+
for cluster, ids in pangenome_clusters_First_sequences.items():
|
|
277
|
+
outfile.write('>group_'+str(cluster)+'\n')
|
|
278
|
+
wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
|
|
279
|
+
outfile.write(wrapped_aa_seq+'\n')
|
|
280
|
+
if options.write_groups != None:
|
|
258
281
|
print("Outputting gene group FASTA files")
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
write_groups(options,output_dir, key_order, cores, sequences,
|
|
282
|
+
output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
|
|
283
|
+
write_groups_func(options,output_dir, key_order, cores, sequences,
|
|
262
284
|
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
|
263
285
|
|
|
264
286
|
if options.align_core != None:
|
|
265
287
|
print("Processing gene group alignment")
|
|
266
|
-
|
|
288
|
+
process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')
|
|
267
289
|
|
|
268
290
|
|
|
269
291
|
|
PyamilySeq/Seq_Combiner.py
CHANGED
|
@@ -2,10 +2,10 @@ import argparse
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
try:
|
|
5
|
-
from .
|
|
5
|
+
from .constants import *
|
|
6
6
|
from .utils import *
|
|
7
7
|
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
8
|
-
from
|
|
8
|
+
from constants import *
|
|
9
9
|
from utils import *
|
|
10
10
|
|
|
11
11
|
|
|
@@ -29,7 +29,7 @@ def main():
|
|
|
29
29
|
help="Directory for all output files.",
|
|
30
30
|
required=True)
|
|
31
31
|
required.add_argument("-output_name", action="store", dest="output_file",
|
|
32
|
-
help="Output file name.",
|
|
32
|
+
help="Output file name (without .fasta).",
|
|
33
33
|
required=True)
|
|
34
34
|
|
|
35
35
|
optional = parser.add_argument_group('Optional Arguments')
|
|
@@ -38,7 +38,7 @@ def main():
|
|
|
38
38
|
' - Not compatible with "fasta" input mode.',
|
|
39
39
|
required=False)
|
|
40
40
|
optional.add_argument('-translate', action='store_true', dest='translate', default=None,
|
|
41
|
-
help='Default - False: Translate extracted sequences to their AA counterpart?',
|
|
41
|
+
help='Default - False: Translate extracted sequences to their AA counterpart? - appends _aa.fasta to given output_name',
|
|
42
42
|
required=False)
|
|
43
43
|
misc = parser.add_argument_group('Misc Arguments')
|
|
44
44
|
misc.add_argument("-v", "--version", action="version",
|
|
@@ -47,14 +47,13 @@ def main():
|
|
|
47
47
|
|
|
48
48
|
options = parser.parse_args()
|
|
49
49
|
|
|
50
|
-
|
|
51
|
-
sys.exit(PyamilySeq_Version)
|
|
50
|
+
|
|
52
51
|
|
|
53
52
|
output_path = os.path.abspath(options.output_dir)
|
|
54
53
|
if not os.path.exists(output_path):
|
|
55
54
|
os.makedirs(output_path)
|
|
56
55
|
|
|
57
|
-
combined_out_file = os.path.join(output_path, options.output_file)
|
|
56
|
+
combined_out_file = os.path.join(output_path, options.output_file + '.fasta')
|
|
58
57
|
|
|
59
58
|
if options.input_type == 'separate':
|
|
60
59
|
read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import copy
|
|
3
|
+
|
|
4
|
+
def find_gene_ids_in_csv(csv_file, group_name):
    """Return the gene IDs listed for ``group_name`` in a presence/absence CSV.

    The file is parsed with the ``csv`` module so that quoted fields which
    contain commas (for example a free-text annotation) do not shift the
    column positions, which a plain ``split(',')`` would do.

    Args:
        csv_file: Path to the CSV file to search.
        group_name: Value to match against the first column of each row
            (double quotes are ignored when comparing).

    Returns:
        A list of gene IDs collected from column index 14 onward of every
        matching row. A cell may hold several whitespace-separated IDs; each
        one becomes a separate list entry. Empty when no row matches.
    """
    # Local import: this module only imports argparse and copy at file level.
    import csv

    gene_ids = []
    with open(csv_file, 'r', newline='') as f:
        for cells in csv.reader(f):
            # Blank lines yield an empty row; skip them and non-matching groups.
            if not cells or cells[0].replace('"', '') != group_name:
                continue
            # Columns 0-13 are per-group metadata; per-genome cells start at 14.
            for cell in cells[14:]:
                # Drop stray quotes, then split on whitespace for multi-ID cells.
                gene_ids.extend(cell.replace('"', '').split())
    return gene_ids
|
|
15
|
+
|
|
16
|
+
def extract_sequences(fasta_file, gene_ids):
    """Extract the FASTA records whose gene ID is listed in ``gene_ids``.

    The gene ID of a record is taken from the first whitespace-delimited
    token of its header: the second ``|``-separated field when the token
    contains ``|``, otherwise the whole token.

    Args:
        fasta_file: Path to the FASTA file to scan.
        gene_ids: Iterable of gene IDs to keep.

    Returns:
        A dict mapping each matched gene ID to a list of stripped lines, the
        header line first followed by that record's sequence lines. If the
        same ID occurs more than once, the last record wins.
    """
    wanted = set(gene_ids)  # constant-time membership test per header
    sequences = {}
    capture = False
    current_id = ""
    with open(fasta_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                tokens = line[1:].split()
                if not tokens:
                    # Bare '>' header: nothing to match, ignore this record.
                    capture = False
                    continue
                fields = tokens[0].split('|')
                # Headers without a '|' separator use the whole token as ID.
                current_id = fields[1] if len(fields) > 1 else fields[0]
                capture = current_id in wanted
                if capture:
                    sequences[current_id] = [line.strip()]  # start with header line
            elif capture:
                sequences[current_id].append(line.strip())  # append sequence lines
    return sequences
|
|
35
|
+
|
|
36
|
+
def main():
    """Command-line entry point.

    Parses the four required options, looks up the gene IDs of the requested
    group in the CSV, pulls the matching records from the FASTA file and
    writes them to the output file. Prints a message and writes nothing when
    the group yields no gene IDs.
    """
    parser = argparse.ArgumentParser(
        description="Extract sequences for specified group name from CSV file and corresponding FASTA file.")
    # (flag, destination attribute, help text) for each required option.
    required_options = (
        ("-csv", 'csv_file', "CSV file containing group data"),
        ("-group", 'group_name', "Group name to search for in the CSV"),
        ("-fasta", 'fasta_file', "Input FASTA file containing sequences"),
        ("-out", 'output_file', "Output FASTA file with extracted sequences"),
    )
    for flag, destination, help_text in required_options:
        parser.add_argument(flag, action='store', dest=destination,
                            help=help_text, required=True)

    options = parser.parse_args()

    # Look up the gene IDs belonging to the requested group.
    gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
    if gene_ids:
        # Pull the matching records and write them out, one line per entry.
        sequences = extract_sequences(options.fasta_file, gene_ids)
        with open(options.output_file, 'w') as output:
            for record_lines in sequences.values():
                output.write("\n".join(record_lines) + "\n")
    else:
        print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")


if __name__ == "__main__":
    main()
|
PyamilySeq/Seq_Finder.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import collections
|
|
3
|
+
import csv
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def parse_fasta_ids(fasta_file):
    """Collect the sequence IDs from the header lines of a FASTA file.

    The ID of a record is the first whitespace-delimited token that follows
    the leading '>' of its header line. IDs are returned in file order.
    """
    with open(fasta_file, 'r') as handle:
        return [
            header[1:].strip().split()[0]
            for header in handle
            if header.startswith('>')
        ]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def find_ids_in_csv(ids, csv_file):
    """Group the given IDs by the first-column value of the CSV rows they occur in.

    An ID matches a row when it is exactly equal to one of that row's cells.

    Args:
        ids: Sequence of IDs to look for (iterated once per CSV row).
        csv_file: Path to the CSV file to search.

    Returns:
        A ``defaultdict(list)`` mapping the first cell of each row that
        contained at least one ID to the list of matching IDs, in the order
        they appear in ``ids``. Rows without a match produce no key.
    """
    found_records = collections.defaultdict(list)
    with open(csv_file, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:  # skip empty rows
                continue
            # Build the lookup once per row instead of scanning the row
            # list again for every ID.
            row_cells = set(row)
            matches = [seq_id for seq_id in ids if seq_id in row_cells]
            if matches:
                found_records[row[0]].extend(matches)
    return found_records
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main():
    """Command-line entry point.

    Reads the sequence IDs from a FASTA file, searches a CSV file for them
    and writes a two-column CSV report: one row per matched ID, giving the
    ID and the first-column value of the CSV row it was found in.
    """
    parser = argparse.ArgumentParser(description="Extract IDs from a FASTA file and search for them in a CSV file.")
    parser.add_argument("-in", action='store', dest='fasta_file',
                        help="Input FASTA file", required=True)
    parser.add_argument("-ids", action='store', dest='csv_file',
                        help="CSV file containing IDs to search for", required=True)
    parser.add_argument("-out", action='store', dest='output_file',
                        help="Output file to save found IDs", required=True)

    options = parser.parse_args()

    # Parse IDs from the FASTA file
    ids = parse_fasta_ids(options.fasta_file)

    # Find IDs in the CSV file
    found_records = find_ids_in_csv(ids, options.csv_file)

    # Write output. found_records maps a first-column value to the list of
    # IDs found in that row, so emit one "ID, first column" row per ID to
    # match the header (previously the columns were reversed and the IDs
    # were written as a Python list repr).
    with open(options.output_file, 'w', newline='') as output:
        writer = csv.writer(output, lineterminator="\n")
        writer.writerow(["ID", "Found_In_First_Column"])
        for first_column, matched_ids in found_records.items():
            for seq_id in matched_ids:
                writer.writerow([seq_id, first_column])


if __name__ == "__main__":
    main()
|
PyamilySeq/clusterings.py
CHANGED
|
@@ -52,6 +52,107 @@ def cluster_CDHIT(options, splitter):
|
|
|
52
52
|
|
|
53
53
|
return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
|
|
54
54
|
|
|
55
|
+
def cluster_BLAST(options, splitter):
    """Cluster sequences from a tab-separated pairwise hit file.

    Each non-blank line of ``options.clusters`` contributes an undirected
    edge between its first two columns. Every connected component of the
    resulting graph becomes one cluster, numbered from 0 in the order its
    first member appears in the file.

    Args:
        options: Object with a ``clusters`` attribute holding the input path.
        splitter: Separator between the genome identifier and the rest of a
            sequence ID; the genome is the part before the first occurrence.

    Returns:
        A 5-tuple ``(taxa_dict, pangenome_clusters_First,
        pangenome_clusters_First_genomes, pangenome_clusters_First_sequences,
        reps)``:

        * ``taxa_dict``: genome -> number of input lines whose second column
          belongs to that genome.
        * ``pangenome_clusters_First``: cluster id (int) -> list of distinct
          genomes in the cluster.
        * ``pangenome_clusters_First_genomes``: seed sequence of each cluster
          -> list of distinct genomes in the cluster.
        * ``pangenome_clusters_First_sequences``: cluster id (int) -> list of
          member sequence IDs.
        * ``reps``: cluster id (int) -> ``[number of sequences, number of
          genomes]``.
    """
    separator = '\t'
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_genomes = defaultdict(list)
    pangenome_clusters_First_sequences = defaultdict(list)
    taxa_dict = defaultdict(int)
    reps = OrderedDict()
    edges = defaultdict(list)
    # The context manager closes the input file (it was previously left open).
    with open(options.clusters, 'r') as First_in:
        for line in First_in:
            if not line.strip():
                continue  # tolerate blank lines instead of raising IndexError
            elements = line.strip().split(separator)
            rep, child = elements[0], elements[1]
            child_taxa = child.split(splitter)[0]  # Extracting the genome identifier from the child sequence
            # Counting occurrences of genomes
            taxa_dict[child_taxa] += 1
            # Store the edge in both directions so components are found
            # regardless of which column a sequence appears in.
            edges[rep].append(child)
            edges[child].append(rep)

    visited = set()
    cluster_id = 0

    def dfs(node, cluster_id):
        """Collect the connected component of ``node`` into ``cluster_id``."""
        stack = [node]
        tmp_genomes = []
        while stack:
            current = stack.pop()
            if current not in visited:
                visited.add(current)
                clustered_taxa = current.split(splitter)[0]
                pangenome_clusters_First_sequences[cluster_id].append(current)
                if clustered_taxa not in pangenome_clusters_First[cluster_id]:
                    pangenome_clusters_First[cluster_id].append(clustered_taxa)
                    tmp_genomes.append(clustered_taxa)
                for neighbor in edges[current]:
                    if neighbor not in visited:
                        stack.append(neighbor)

        pangenome_clusters_First_genomes[node] = tmp_genomes

    # Every node is already a key of ``edges``, so the lookups inside dfs()
    # never add keys while this loop iterates over the dict.
    for node in edges:
        if node not in visited:
            pangenome_clusters_First[cluster_id] = []
            pangenome_clusters_First_sequences[cluster_id] = []
            pangenome_clusters_First_genomes[node] = []
            dfs(node, cluster_id)
            cluster_id += 1

    for cluster_key in pangenome_clusters_First:
        cluster_size = len(pangenome_clusters_First_sequences[cluster_key])
        reps[cluster_key] = [cluster_size, len(pangenome_clusters_First[cluster_key])]

    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
|
|
107
|
+
|
|
108
|
+
def cluster_MMseqs(options,splitter):
    """Read clusters from a tab-separated ``representative<TAB>member`` file.

    Consecutive lines that share the same first column form one cluster; a
    change of representative starts the next cluster. Clusters are keyed by
    the string form of a counter starting at ``'0'``.

    Args:
        options: Object with a ``clusters`` attribute holding the input path.
        splitter: Separator between the genome identifier and the rest of a
            sequence ID; the genome is the part before the first occurrence.

    Returns:
        A 5-tuple ``(taxa_dict, pangenome_clusters_First,
        pangenome_clusters_First_genomes, pangenome_clusters_First_sequences,
        reps)``:

        * ``taxa_dict``: genome -> number of member lines from that genome.
        * ``pangenome_clusters_First``: cluster id (str) -> list of distinct
          genomes in the cluster.
        * ``pangenome_clusters_First_genomes``: representative -> list of
          distinct genomes in its cluster.
        * ``pangenome_clusters_First_sequences``: cluster id (str) -> list of
          member sequence IDs.
        * ``reps``: representative -> ``[number of sequences, number of
          genomes]``.

        All five are empty when the input contains no data lines.
    """
    separator = '\t'
    cluster_id = 0
    last_rep = ''
    first = True
    pangenome_clusters_First = OrderedDict()
    pangenome_clusters_First_genomes = OrderedDict()
    pangenome_clusters_First_sequences = OrderedDict()
    taxa_dict = defaultdict(int)
    reps = OrderedDict()
    tmp_genomes = []
    # The context manager closes the input file (it was previously left open).
    with open(options.clusters, 'r') as First_in:
        for line in First_in:
            if not line.strip():
                continue  # tolerate blank lines instead of raising IndexError
            elements = line.strip().split(separator)
            rep, child = elements[0], elements[1]
            child_taxa = child.split(splitter)[0]  # Extracting the genome identifier from the child sequence
            # Counting occurrences of genomes
            taxa_dict[child_taxa] += 1
            if first:
                pangenome_clusters_First['0'] = []
                pangenome_clusters_First_sequences['0'] = []
                first = False

            if rep != last_rep and last_rep != '':
                # The representative changed: finalise the previous cluster.
                # Its genome list belongs to ``last_rep`` (it used to be
                # stored under the new representative by mistake).
                finished_key = str(cluster_id)
                pangenome_clusters_First_genomes[last_rep] = tmp_genomes
                reps.update({last_rep: [len(pangenome_clusters_First_sequences[finished_key]),
                                        len(pangenome_clusters_First[finished_key])]})
                # Start the next cluster.
                tmp_genomes = []
                cluster_id += 1
                pangenome_clusters_First[str(cluster_id)] = []
                pangenome_clusters_First_sequences[str(cluster_id)] = []

            if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
                pangenome_clusters_First[str(cluster_id)].append(child_taxa)
                tmp_genomes.append(child_taxa)

            pangenome_clusters_First_sequences[str(cluster_id)].append(child)
            last_rep = rep

    if not first:
        # Finalise the last open cluster; skipped when no data line was read.
        cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
        reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
        pangenome_clusters_First_genomes[last_rep] = tmp_genomes

    return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
|
|
55
156
|
|
|
56
157
|
|
|
57
158
|
#@profile
|
|
@@ -138,10 +239,10 @@ def single_clustering_counting(pangenome_clusters_First, reps):
|
|
|
138
239
|
pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
|
|
139
240
|
list_of_reps = list(reps.keys())
|
|
140
241
|
for cluster, First_taxa in pangenome_clusters_First.items():
|
|
141
|
-
rep = list_of_reps[int(cluster)] # get the rep of the current
|
|
242
|
+
rep = list_of_reps[int(cluster)] # get the rep of the current cluster
|
|
142
243
|
|
|
143
244
|
try: # get the cluster from the storf clusters which contains this rep
|
|
144
|
-
num_clustered_First[str(cluster)].append(rep + '_' + str(len(First_taxa)))
|
|
245
|
+
num_clustered_First[str(cluster)].append(str(rep) + '_' + str(len(First_taxa)))
|
|
145
246
|
size_of_First_clusters = []
|
|
146
247
|
Firsts = num_clustered_First[str(cluster)]
|
|
147
248
|
for First in Firsts:
|
|
@@ -178,6 +279,8 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
|
|
|
178
279
|
first = True
|
|
179
280
|
for line in Second_in:
|
|
180
281
|
if line.startswith('>'):
|
|
282
|
+
if '>Cluster 1997' in line:
|
|
283
|
+
print()
|
|
181
284
|
if first == False:
|
|
182
285
|
cluster_size = len(Combined_clusters[cluster_id])
|
|
183
286
|
Combined_reps.update({rep: cluster_size})
|
|
@@ -196,6 +299,7 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
|
|
|
196
299
|
VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
|
|
197
300
|
KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
|
|
198
301
|
combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
|
|
302
|
+
|
|
199
303
|
cluster_id = line.strip('>')
|
|
200
304
|
cluster_id = cluster_id.strip('\n')
|
|
201
305
|
cluster_id = cluster_id.split(' ')[1]
|
|
@@ -233,55 +337,40 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
|
|
|
233
337
|
return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
|
|
234
338
|
|
|
235
339
|
|
|
236
|
-
def cluster_EdgeList(options,splitter):
|
|
237
|
-
if options.cluster_format == 'TSV':
|
|
238
|
-
separator = '\t'
|
|
239
|
-
elif options.cluster_format == 'CSV':
|
|
240
|
-
separator = ','
|
|
241
|
-
cluster_id = 0
|
|
242
|
-
last_rep = ''
|
|
243
|
-
first = True
|
|
244
|
-
First_in = open(options.clusters, 'r')
|
|
245
|
-
pangenome_clusters_First = OrderedDict()
|
|
246
|
-
pangenome_clusters_First_genomes = OrderedDict()
|
|
247
|
-
pangenome_clusters_First_sequences = OrderedDict()
|
|
248
|
-
taxa_dict = defaultdict(int)
|
|
249
|
-
reps = OrderedDict()
|
|
250
|
-
tmp_genomes = None
|
|
251
|
-
for line in First_in:
|
|
252
|
-
rep, child = line.strip().split(separator)
|
|
253
|
-
child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
|
|
254
|
-
# Counting occurrences of genomes
|
|
255
|
-
taxa_dict[child_taxa] += 1
|
|
256
|
-
if first == True:
|
|
257
|
-
pangenome_clusters_First['0'] = []
|
|
258
|
-
pangenome_clusters_First_sequences['0'] = []
|
|
259
|
-
first = False
|
|
260
|
-
tmp_genomes = []
|
|
261
340
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
341
|
+
# def cluster_BLAST(options, splitter):
|
|
342
|
+
# separator = '\t'
|
|
343
|
+
# First_in = open(options.clusters, 'r')
|
|
344
|
+
# pangenome_clusters_First = OrderedDict()
|
|
345
|
+
# pangenome_clusters_First_genomes = defaultdict(list)
|
|
346
|
+
# pangenome_clusters_First_sequences = defaultdict(list)
|
|
347
|
+
# taxa_dict = defaultdict(int)
|
|
348
|
+
# reps = OrderedDict()
|
|
349
|
+
#
|
|
350
|
+
# for line in First_in:
|
|
351
|
+
# elements = line.strip().split(separator)
|
|
352
|
+
# rep, child = elements[0], elements[1]
|
|
353
|
+
# child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
|
|
354
|
+
# # Counting occurrences of genomes
|
|
355
|
+
# taxa_dict[child_taxa] += 1
|
|
356
|
+
#
|
|
357
|
+
# if rep not in pangenome_clusters_First:
|
|
358
|
+
# pangenome_clusters_First[rep] = []
|
|
359
|
+
# pangenome_clusters_First_sequences[rep] = []
|
|
360
|
+
#
|
|
361
|
+
# if child_taxa not in pangenome_clusters_First[rep]:
|
|
362
|
+
# pangenome_clusters_First[rep].append(child_taxa)
|
|
363
|
+
# pangenome_clusters_First_genomes[rep].append(child_taxa)
|
|
364
|
+
#
|
|
365
|
+
# pangenome_clusters_First_sequences[rep].append(child)
|
|
366
|
+
#
|
|
367
|
+
# for rep in pangenome_clusters_First:
|
|
368
|
+
# cluster_size = len(pangenome_clusters_First_sequences[rep])
|
|
369
|
+
# reps[rep] = [cluster_size, len(pangenome_clusters_First[rep])]
|
|
370
|
+
#
|
|
371
|
+
# return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
|
|
275
372
|
|
|
276
|
-
pangenome_clusters_First_sequences[str(cluster_id)].append(child)
|
|
277
|
-
last_rep = rep
|
|
278
|
-
cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
|
|
279
|
-
reps.update({rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
|
|
280
|
-
|
|
281
|
-
#!!# May not be needed below
|
|
282
|
-
pangenome_clusters_First_genomes[rep] = tmp_genomes
|
|
283
373
|
|
|
284
|
-
return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
|
|
285
374
|
|
|
286
375
|
|
|
287
376
|
def combined_clustering_Edge_List(options, splitter):
|
|
@@ -305,7 +394,8 @@ def combined_clustering_Edge_List(options, splitter):
|
|
|
305
394
|
Combined_reps = OrderedDict()
|
|
306
395
|
first = True
|
|
307
396
|
for line in Second_in:
|
|
308
|
-
|
|
397
|
+
elements = line.strip().split(separator)
|
|
398
|
+
rep, child = elements[0], elements[1]
|
|
309
399
|
child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
|
|
310
400
|
|
|
311
401
|
if first == True:
|
PyamilySeq/constants.py
ADDED