PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,27 @@
1
- #from line_profiler_pycharm import profile
2
-
3
- import math
4
1
 
5
2
  try:
6
- from .Constants import *
3
+ from .constants import *
7
4
  from .clusterings import *
8
5
  from .utils import *
9
6
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
10
- from Constants import *
7
+ from constants import *
11
8
  from clusterings import *
12
9
  from utils import *
13
10
 
14
11
 
15
- #def output_fasta(options, gene_families):
16
-
17
12
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
18
13
  print("Outputting gene_presence_absence file")
19
14
  output_dir = os.path.abspath(options.output_dir)
20
- in_name = options.clusters.split('.')[0].split('/')[-1]
21
- gpa_outfile = os.path.join(output_dir, in_name)
22
- gpa_outfile = open(gpa_outfile+'_gene_presence_absence.csv','w')
15
+ #in_name = options.clusters.split('.')[0].split('/')[-1]
16
+ gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
17
+ gpa_outfile = open(gpa_outfile, 'w')
23
18
  gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
24
19
  '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
25
20
  gpa_outfile.write('","'.join(genome_dict.keys()))
26
21
  gpa_outfile.write('"\n')
27
22
  for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
28
23
  average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
29
- gpa_outfile.write('"group_'+str(cluster)+'","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
24
+ gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
30
25
  '","","","","","","","","",""')
31
26
 
32
27
 
@@ -35,9 +30,9 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
35
30
  tmp_list = []
36
31
  for value in sequences:
37
32
  if value.split('|')[0] == genome:
38
- tmp_list.append(value)
33
+ tmp_list.append(value.split('|')[1])
39
34
  if tmp_list:
40
- full_out += ',"'+''.join(tmp_list)+'"'
35
+ full_out += ',"'+'\t'.join(tmp_list)+'"'
41
36
  else:
42
37
  full_out = ',""'
43
38
  gpa_outfile.write(full_out)
@@ -64,7 +59,7 @@ def get_cores(options,genome_dict):
64
59
  cores = OrderedDict()
65
60
  prev_top = len(genome_dict)
66
61
  first = True
67
- for group in options.core_groups.split(','):
62
+ for group in options.species_groups.split(','):
68
63
  calculated_floor = math.floor(int(group) / 100 * len(genome_dict))
69
64
  if first == False:
70
65
  groups[group] = (calculated_floor,prev_top)
@@ -138,14 +133,16 @@ def cluster(options):
138
133
 
139
134
  if options.cluster_format == 'CD-HIT':
140
135
  genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
141
- elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
142
- genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
136
+ elif 'BLAST' in options.cluster_format:
137
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_BLAST(options, '|')
138
+ elif 'MMseqs' in options.cluster_format:
139
+ genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_MMseqs(options, '|')
143
140
 
144
141
  ###
145
142
  cores, groups = get_cores(options, genome_dict)
146
143
  ###
147
144
 
148
- if options.reclustered != None:
145
+ if options.reclustered != None: #FIX
149
146
  if options.cluster_format == 'CD-HIT':
150
147
  combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
151
148
  elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
@@ -169,8 +166,6 @@ def cluster(options):
169
166
  for cluster, numbers in pangenome_clusters_Type_sorted.items():
170
167
  ############################### Calculate First only
171
168
  cluster = str(cluster)
172
- if '78' in cluster:
173
- pass
174
169
  for grouping in numbers[2]: #!!# Could do with a more elegant solution
175
170
  current_cluster = grouping[0].split(':')[0]
176
171
  if current_cluster not in seen_groupings:
@@ -210,8 +205,10 @@ def cluster(options):
210
205
  stats_out = os.path.join(output_path,'summary_statistics.txt')
211
206
  key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
212
207
  with open(stats_out, 'w') as outfile:
208
+ print("Number of Genomes: " + str(len(genome_dict)))
209
+ outfile.write("Number of Genomes: " + str(len(genome_dict)) + "\n")
213
210
  print("Gene Groups:")
214
- outfile.write("Gene Groups:\n")
211
+ outfile.write("Gene Groups\n")
215
212
  for key_prefix in key_order:
216
213
  for key, value in cores.items():
217
214
  if key.startswith(key_prefix):
@@ -236,34 +233,59 @@ def cluster(options):
236
233
  ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gene families then align. If --write-groups is not present then it needs
237
234
  # to be done for alignment full anyway...
238
235
 
236
+ genome_list = list(genome_dict.keys())
239
237
  if options.run_mode == 'Full':
238
+ sequences = read_fasta(options.fasta)
240
239
  if options.reclustered == None:
241
240
  combined_pangenome_clusters_Second_sequences = None
241
+ ## Output representative sequences
242
+ representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
243
+ with open(representatives_out, 'w') as outfile:
244
+ for cluster, ids in pangenome_clusters_First_sequences.items():
245
+ outfile.write('>group_'+str(cluster)+'\n')
246
+ wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
247
+ outfile.write(wrapped_aa_seq+'\n')
242
248
  if options.write_groups != None:
243
249
  print("Outputting gene group FASTA files")
244
- sequences = read_fasta(options.fasta)
245
250
  #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
246
- output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
247
- write_groups(options,output_dir, key_order, cores, sequences,
251
+ output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
252
+ write_groups_func(options,output_dir, key_order, cores, sequences,
248
253
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
249
254
 
250
255
  if options.align_core != None:
251
256
  print("Processing gene group alignment")
252
- process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
257
+ process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')
253
258
 
254
259
  elif options.run_mode == 'Partial':
260
+ sequences = read_fasta(options.fasta)
255
261
  if options.reclustered == None:
256
262
  combined_pangenome_clusters_Second_sequences = None
257
- if options.write_groups != None and options.fasta != None:
263
+ # else: ## Output representative sequences - Under development
264
+ # representatives_out = os.path.join(output_path, 'pan_genome_reference_reclustered.fa')
265
+ # with open(representatives_out, 'w') as outfile:
266
+ # for cluster, ids in combined_pangenome_clusters_Second_sequences.items():
267
+ # outfile.write('>group_' + str(cluster) + '\n')
268
+ # try:
269
+ # wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
270
+ # except:
271
+ # print(2)
272
+ # outfile.write(wrapped_aa_seq + '\n')
273
+ ## Output representative sequences
274
+ representatives_out = os.path.join(output_path,'pan_genome_reference.fa')
275
+ with open(representatives_out, 'w') as outfile:
276
+ for cluster, ids in pangenome_clusters_First_sequences.items():
277
+ outfile.write('>group_'+str(cluster)+'\n')
278
+ wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
279
+ outfile.write(wrapped_aa_seq+'\n')
280
+ if options.write_groups != None:
258
281
  print("Outputting gene group FASTA files")
259
- sequences = read_fasta(options.fasta)
260
- output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
261
- write_groups(options,output_dir, key_order, cores, sequences,
282
+ output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
283
+ write_groups_func(options,output_dir, key_order, cores, sequences,
262
284
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
263
285
 
264
286
  if options.align_core != None:
265
287
  print("Processing gene group alignment")
266
- process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
288
+ process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')
267
289
 
268
290
 
269
291
 
@@ -2,10 +2,10 @@ import argparse
2
2
 
3
3
 
4
4
  try:
5
- from .Constants import *
5
+ from .constants import *
6
6
  from .utils import *
7
7
  except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
8
- from Constants import *
8
+ from constants import *
9
9
  from utils import *
10
10
 
11
11
 
@@ -29,7 +29,7 @@ def main():
29
29
  help="Directory for all output files.",
30
30
  required=True)
31
31
  required.add_argument("-output_name", action="store", dest="output_file",
32
- help="Output file name.",
32
+ help="Output file name (without .fasta).",
33
33
  required=True)
34
34
 
35
35
  optional = parser.add_argument_group('Optional Arguments')
@@ -38,7 +38,7 @@ def main():
38
38
  ' - Not compatible with "fasta" input mode.',
39
39
  required=False)
40
40
  optional.add_argument('-translate', action='store_true', dest='translate', default=None,
41
- help='Default - False: Translate extracted sequences to their AA counterpart?',
41
+ help='Default - False: Translate extracted sequences to their AA counterpart? - appends _aa.fasta to given output_name',
42
42
  required=False)
43
43
  misc = parser.add_argument_group('Misc Arguments')
44
44
  misc.add_argument("-v", "--version", action="version",
@@ -47,14 +47,13 @@ def main():
47
47
 
48
48
  options = parser.parse_args()
49
49
 
50
- if options.version:
51
- sys.exit(PyamilySeq_Version)
50
+
52
51
 
53
52
  output_path = os.path.abspath(options.output_dir)
54
53
  if not os.path.exists(output_path):
55
54
  os.makedirs(output_path)
56
55
 
57
- combined_out_file = os.path.join(output_path, options.output_file)
56
+ combined_out_file = os.path.join(output_path, options.output_file + '.fasta')
58
57
 
59
58
  if options.input_type == 'separate':
60
59
  read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ import copy
3
+
4
+ def find_gene_ids_in_csv(csv_file, group_name):
5
+ """Find gene IDs associated with the specified group name in the CSV file, starting from column 14."""
6
+ gene_ids = []
7
+ with open(csv_file, 'r') as f:
8
+ for line in f:
9
+ cells = line.strip().split(',')
10
+ if cells[0].replace('"','') == group_name:
11
+ # Collect gene IDs from column 14 onward
12
+ for cell in cells[14:]:
13
+ gene_ids.extend(cell.strip().replace('"','').split()) # Splitting by spaces if there are multiple IDs in a cell
14
+ return gene_ids
15
+
16
+ def extract_sequences(fasta_file, gene_ids):
17
+ """Extract sequences from the FASTA file that match the gene IDs."""
18
+ sequences = {}
19
+ capture = False
20
+ current_id = ""
21
+ not_found = copy.deepcopy(gene_ids)
22
+ with open(fasta_file, 'r') as f:
23
+ for line in f:
24
+ if line.startswith('>'):
25
+ # Extract the ID part after '>' and check if it's in gene_ids
26
+ current_id = line[1:].strip().split()[0].split('|')[1]
27
+ capture = current_id in gene_ids
28
+ if current_id in not_found:
29
+ not_found.remove(current_id)
30
+ if capture:
31
+ sequences[current_id] = [line.strip()] # Start with header line
32
+ elif capture:
33
+ sequences[current_id].append(line.strip()) # Append sequence lines
34
+ return sequences
35
+
36
+ def main():
37
+ parser = argparse.ArgumentParser(description="Extract sequences for specified group name from CSV file and corresponding FASTA file.")
38
+ parser.add_argument("-csv", action='store', dest='csv_file',
39
+ help="CSV file containing group data", required=True)
40
+ parser.add_argument("-group", action='store', dest='group_name',
41
+ help="Group name to search for in the CSV", required=True)
42
+ parser.add_argument("-fasta", action='store', dest='fasta_file',
43
+ help="Input FASTA file containing sequences", required=True)
44
+ parser.add_argument("-out", action='store', dest='output_file',
45
+ help="Output FASTA file with extracted sequences", required=True)
46
+
47
+ options = parser.parse_args()
48
+
49
+ # Find gene IDs in CSV
50
+ gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
51
+ if not gene_ids:
52
+ print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")
53
+ return
54
+
55
+ # Extract sequences from the FASTA file
56
+ sequences = extract_sequences(options.fasta_file, gene_ids)
57
+
58
+ # Write matched sequences to the output FASTA file
59
+ with open(options.output_file, 'w') as output:
60
+ for gene_id, sequence_lines in sequences.items():
61
+ output.write("\n".join(sequence_lines) + "\n")
62
+
63
+ if __name__ == "__main__":
64
+ main()
@@ -0,0 +1,56 @@
1
+ import argparse
2
+ import collections
3
+ import csv
4
+
5
+
6
+ def parse_fasta_ids(fasta_file):
7
+ """Extract IDs from the FASTA file."""
8
+ ids = []
9
+ with open(fasta_file, 'r') as f:
10
+ for line in f:
11
+ if line.startswith('>'):
12
+ seq_id = line[1:].strip().split()[0] # Capture the ID after '>'
13
+ ids.append(seq_id)
14
+ return ids
15
+
16
+
17
+ def find_ids_in_csv(ids, csv_file):
18
+ """Search for each ID in the CSV file and report the first column where it is found."""
19
+ found_records = collections.defaultdict(list)
20
+ with open(csv_file, 'r') as f:
21
+ csv_reader = csv.reader(f)
22
+ for row in csv_reader:
23
+ if row: # Ensure row is not empty
24
+
25
+ for id in ids: # slow
26
+ if id in row:
27
+ found_records[row[0]].append(id)
28
+ return found_records
29
+
30
+
31
+ def main():
32
+ parser = argparse.ArgumentParser(description="Extract IDs from a FASTA file and search for them in a CSV file.")
33
+ parser.add_argument("-in", action='store', dest='fasta_file',
34
+ help="Input FASTA file", required=True)
35
+ parser.add_argument("-ids", action='store', dest='csv_file',
36
+ help="CSV file containing IDs to search for", required=True)
37
+ parser.add_argument("-out", action='store', dest='output_file',
38
+ help="Output file to save found IDs", required=True)
39
+
40
+ options = parser.parse_args()
41
+
42
+ # Parse IDs from the FASTA file
43
+ ids = parse_fasta_ids(options.fasta_file)
44
+
45
+ # Find IDs in the CSV file
46
+ found_records = find_ids_in_csv(ids, options.csv_file)
47
+
48
+ # Write output
49
+ with open(options.output_file, 'w') as output:
50
+ output.write("ID,Found_In_First_Column\n")
51
+ for seq_id, found_in in found_records.items():
52
+ output.write(f"{seq_id},{found_in}\n")
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
PyamilySeq/clusterings.py CHANGED
@@ -52,6 +52,107 @@ def cluster_CDHIT(options, splitter):
52
52
 
53
53
  return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
54
54
 
55
+ def cluster_BLAST(options, splitter):
56
+ separator = '\t'
57
+ First_in = open(options.clusters, 'r')
58
+ pangenome_clusters_First = OrderedDict()
59
+ pangenome_clusters_First_genomes = defaultdict(list)
60
+ pangenome_clusters_First_sequences = defaultdict(list)
61
+ taxa_dict = defaultdict(int)
62
+ reps = OrderedDict()
63
+ edges = defaultdict(list)
64
+ for line in First_in:
65
+ elements = line.strip().split(separator)
66
+ rep, child = elements[0], elements[1]
67
+ child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
68
+ # Counting occurrences of genomes
69
+ taxa_dict[child_taxa] += 1
70
+ edges[rep].append(child)
71
+ edges[child].append(rep)
72
+
73
+ visited = set()
74
+ cluster_id = 0
75
+
76
+ def dfs(node, cluster_id):
77
+ stack = [node]
78
+ tmp_genomes = []
79
+ while stack:
80
+ current = stack.pop()
81
+ if current not in visited:
82
+ visited.add(current)
83
+ clustered_taxa = current.split(splitter)[0]
84
+ pangenome_clusters_First_sequences[cluster_id].append(current)
85
+ if clustered_taxa not in pangenome_clusters_First[cluster_id]:
86
+ pangenome_clusters_First[cluster_id].append(clustered_taxa)
87
+ tmp_genomes.append(clustered_taxa)
88
+ for neighbor in edges[current]:
89
+ if neighbor not in visited:
90
+ stack.append(neighbor)
91
+
92
+ pangenome_clusters_First_genomes[node] = tmp_genomes
93
+
94
+ for node in edges:
95
+ if node not in visited:
96
+ pangenome_clusters_First[cluster_id] = []
97
+ pangenome_clusters_First_sequences[cluster_id] = []
98
+ pangenome_clusters_First_genomes[node] = []
99
+ dfs(node, cluster_id)
100
+ cluster_id += 1
101
+
102
+ for rep in pangenome_clusters_First:
103
+ cluster_size = len(pangenome_clusters_First_sequences[rep])
104
+ reps[rep] = [cluster_size, len(pangenome_clusters_First[rep])]
105
+
106
+ return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
107
+
108
+ def cluster_MMseqs(options,splitter):
109
+ separator = '\t'
110
+ cluster_id = 0
111
+ last_rep = ''
112
+ first = True
113
+ First_in = open(options.clusters, 'r')
114
+ pangenome_clusters_First = OrderedDict()
115
+ pangenome_clusters_First_genomes = OrderedDict()
116
+ pangenome_clusters_First_sequences = OrderedDict()
117
+ taxa_dict = defaultdict(int)
118
+ reps = OrderedDict()
119
+ tmp_genomes = None
120
+ for line in First_in:
121
+
122
+ elements = line.strip().split(separator)
123
+ rep, child = elements[0], elements[1]
124
+ child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
125
+ # Counting occurrences of genomes
126
+ taxa_dict[child_taxa] += 1
127
+ if first == True:
128
+ pangenome_clusters_First['0'] = []
129
+ pangenome_clusters_First_sequences['0'] = []
130
+ first = False
131
+ tmp_genomes = []
132
+
133
+ if rep != last_rep and last_rep != '':
134
+ pangenome_clusters_First_genomes[rep] = tmp_genomes
135
+ tmp_genomes = []
136
+ cluster_id +=1
137
+ pangenome_clusters_First[str(cluster_id)] = []
138
+ pangenome_clusters_First_sequences[str(cluster_id)] = []
139
+ cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id-1)])
140
+ reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id-1)])]})
141
+ pangenome_clusters_First[str(cluster_id)] = []
142
+ pangenome_clusters_First_sequences[str(cluster_id)] = []
143
+ if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
144
+ pangenome_clusters_First[str(cluster_id)].append(child_taxa)
145
+ tmp_genomes.append(child_taxa)
146
+
147
+ pangenome_clusters_First_sequences[str(cluster_id)].append(child)
148
+ last_rep = rep
149
+ cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
150
+ reps.update({rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
151
+
152
+ #!!# May not be needed below
153
+ pangenome_clusters_First_genomes[rep] = tmp_genomes
154
+
155
+ return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
55
156
 
56
157
 
57
158
  #@profile
@@ -138,10 +239,10 @@ def single_clustering_counting(pangenome_clusters_First, reps):
138
239
  pangenome_clusters_Type = copy.deepcopy(pangenome_clusters_First)
139
240
  list_of_reps = list(reps.keys())
140
241
  for cluster, First_taxa in pangenome_clusters_First.items():
141
- rep = list_of_reps[int(cluster)] # get the rep of the current pep cluster
242
+ rep = list_of_reps[int(cluster)] # get the rep of the current cluster
142
243
 
143
244
  try: # get the cluster from the storf clusters which contains this rep
144
- num_clustered_First[str(cluster)].append(rep + '_' + str(len(First_taxa)))
245
+ num_clustered_First[str(cluster)].append(str(rep) + '_' + str(len(First_taxa)))
145
246
  size_of_First_clusters = []
146
247
  Firsts = num_clustered_First[str(cluster)]
147
248
  for First in Firsts:
@@ -178,6 +279,8 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
178
279
  first = True
179
280
  for line in Second_in:
180
281
  if line.startswith('>'):
282
+ if '>Cluster 1997' in line:
283
+ print()
181
284
  if first == False:
182
285
  cluster_size = len(Combined_clusters[cluster_id])
183
286
  Combined_reps.update({rep: cluster_size})
@@ -196,6 +299,7 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
196
299
  VALUE = combined_pangenome_clusters_Second_sequences[cluster_id]
197
300
  KEY = combined_pangenome_clusters_First_sequences[cluster_id][0]
198
301
  combined_pangenome_clusters_First_Second_clustered.update({KEY: VALUE})
302
+
199
303
  cluster_id = line.strip('>')
200
304
  cluster_id = cluster_id.strip('\n')
201
305
  cluster_id = cluster_id.split(' ')[1]
@@ -233,55 +337,40 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
233
337
  return combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences
234
338
 
235
339
 
236
- def cluster_EdgeList(options,splitter):
237
- if options.cluster_format == 'TSV':
238
- separator = '\t'
239
- elif options.cluster_format == 'CSV':
240
- separator = ','
241
- cluster_id = 0
242
- last_rep = ''
243
- first = True
244
- First_in = open(options.clusters, 'r')
245
- pangenome_clusters_First = OrderedDict()
246
- pangenome_clusters_First_genomes = OrderedDict()
247
- pangenome_clusters_First_sequences = OrderedDict()
248
- taxa_dict = defaultdict(int)
249
- reps = OrderedDict()
250
- tmp_genomes = None
251
- for line in First_in:
252
- rep, child = line.strip().split(separator)
253
- child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
254
- # Counting occurrences of genomes
255
- taxa_dict[child_taxa] += 1
256
- if first == True:
257
- pangenome_clusters_First['0'] = []
258
- pangenome_clusters_First_sequences['0'] = []
259
- first = False
260
- tmp_genomes = []
261
340
 
262
- if rep != last_rep and last_rep != '':
263
- pangenome_clusters_First_genomes[rep] = tmp_genomes
264
- tmp_genomes = []
265
- cluster_id +=1
266
- pangenome_clusters_First[str(cluster_id)] = []
267
- pangenome_clusters_First_sequences[str(cluster_id)] = []
268
- cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id-1)])
269
- reps.update({last_rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id-1)])]})
270
- pangenome_clusters_First[str(cluster_id)] = []
271
- pangenome_clusters_First_sequences[str(cluster_id)] = []
272
- if child_taxa not in pangenome_clusters_First[str(cluster_id)]:
273
- pangenome_clusters_First[str(cluster_id)].append(child_taxa)
274
- tmp_genomes.append(child_taxa)
341
+ # def cluster_BLAST(options, splitter):
342
+ # separator = '\t'
343
+ # First_in = open(options.clusters, 'r')
344
+ # pangenome_clusters_First = OrderedDict()
345
+ # pangenome_clusters_First_genomes = defaultdict(list)
346
+ # pangenome_clusters_First_sequences = defaultdict(list)
347
+ # taxa_dict = defaultdict(int)
348
+ # reps = OrderedDict()
349
+ #
350
+ # for line in First_in:
351
+ # elements = line.strip().split(separator)
352
+ # rep, child = elements[0], elements[1]
353
+ # child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
354
+ # # Counting occurrences of genomes
355
+ # taxa_dict[child_taxa] += 1
356
+ #
357
+ # if rep not in pangenome_clusters_First:
358
+ # pangenome_clusters_First[rep] = []
359
+ # pangenome_clusters_First_sequences[rep] = []
360
+ #
361
+ # if child_taxa not in pangenome_clusters_First[rep]:
362
+ # pangenome_clusters_First[rep].append(child_taxa)
363
+ # pangenome_clusters_First_genomes[rep].append(child_taxa)
364
+ #
365
+ # pangenome_clusters_First_sequences[rep].append(child)
366
+ #
367
+ # for rep in pangenome_clusters_First:
368
+ # cluster_size = len(pangenome_clusters_First_sequences[rep])
369
+ # reps[rep] = [cluster_size, len(pangenome_clusters_First[rep])]
370
+ #
371
+ # return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
275
372
 
276
- pangenome_clusters_First_sequences[str(cluster_id)].append(child)
277
- last_rep = rep
278
- cluster_size = len(pangenome_clusters_First_sequences[str(cluster_id)])
279
- reps.update({rep: [cluster_size, len(pangenome_clusters_First[str(cluster_id)])]})
280
-
281
- #!!# May not be needed below
282
- pangenome_clusters_First_genomes[rep] = tmp_genomes
283
373
 
284
- return taxa_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps
285
374
 
286
375
 
287
376
  def combined_clustering_Edge_List(options, splitter):
@@ -305,7 +394,8 @@ def combined_clustering_Edge_List(options, splitter):
305
394
  Combined_reps = OrderedDict()
306
395
  first = True
307
396
  for line in Second_in:
308
- rep, child = line.strip().split(separator)
397
+ elements = line.strip().split(separator)
398
+ rep, child = elements[0], elements[1]
309
399
  child_taxa = child.split(splitter)[0] # Extracting the genome identifier from the child sequence
310
400
 
311
401
  if first == True:
@@ -0,0 +1,2 @@
1
+ PyamilySeq_Version = 'v1.0.1'
2
+