PyamilySeq 0.9.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq-1.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,17 @@
+ Metadata-Version: 2.1
+ Name: PyamilySeq
+ Version: 1.0.0
+ Summary: rORForise - A tool to study read-level gene predictions.
+ Home-page: https://github.com/NickJD/rORForise
+ Author: Nicholas Dimonaco
+ Author-email: nicholas@dimonaco.co.uk
+ Project-URL: Bug Tracker, https://github.com/NickJD/rORForise/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+
+ # rORForise
+ Read-based gene coverage evaluation
PyamilySeq-1.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ PyamilySeq-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ PyamilySeq-1.0.0.dist-info/METADATA,sha256=AmvKK-9jDxFly93v2XT9WpmdU6n1jEPHCw7CgHr7ktM,608
+ PyamilySeq-1.0.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ PyamilySeq-1.0.0.dist-info/entry_points.txt,sha256=Ip84PS-IG05XWHiA98MiXE9AJVmqTa5O7BQ2cywrDoo,49
+ PyamilySeq-1.0.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ PyamilySeq-1.0.0.dist-info/RECORD,,
PyamilySeq-0.9.0.dist-info/WHEEL → PyamilySeq-1.0.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.1.0)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
PyamilySeq-1.0.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ eval = rORForise.evaluate:main
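Note that, like the Summary and Home-page above, this entry point references rORForise rather than PyamilySeq. Installing the wheel puts an `eval` command on the PATH whose behaviour is roughly the following sketch (the real setuptools-generated launcher differs in detail, and `rORForise.evaluate` is not part of this diff):

```python
# Rough equivalent of the generated 'eval' console script (a sketch):
import sys

from rORForise.evaluate import main  # the module:function named by the entry point

if __name__ == "__main__":
    sys.exit(main())
```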
PyamilySeq-1.0.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+
@@ -1,163 +0,0 @@
- import argparse
- from collections import OrderedDict
- from collections import defaultdict
-
- try:
-     from .Constants import *
-     from .utils import *
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
-     from Constants import *
-     from utils import *
-
-
- def categorise_percentage(percent):
-     """Categorise the percentage of genomes with multicopy genes."""
-     if 20 <= percent < 40:
-         return "20-40%"
-     elif 40 <= percent < 60:
-         return "40-60%"
-     elif 60 <= percent < 80:
-         return "60-80%"
-     elif 80 <= percent < 95:
-         return "80-95%"
-     elif 95 <= percent < 99:
-         return "95-99%"
-     elif 99 <= percent <= 100:
-         return "99-100%"
-     return None
-
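The bins above are left-inclusive, and anything below 20% falls through to None, so boundary values behave as in this quick check (a sketch, assuming the function is imported):

```python
# Boundary behaviour of categorise_percentage:
assert categorise_percentage(95) == "95-99%"    # a left edge belongs to the upper bin
assert categorise_percentage(100) == "99-100%"  # 100% is included
assert categorise_percentage(19.9) is None      # below 20% is left uncategorised
```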
- # Read cd-hit .clstr file and extract information
- def read_cd_hit_output(clustering_output):
-     clusters = OrderedDict()
-
-     with open(clustering_output, 'r') as f:
-         current_cluster_id = None
-
-         for line in f:
-             line = line.strip()
-             if line.startswith(">Cluster"):
-                 current_cluster_id = line.split(' ')[1]
-                 clusters[current_cluster_id] = []
-             elif line and current_cluster_id is not None:
-                 parts = line.split('\t')
-                 if len(parts) > 1:
-                     clustered_info = parts[1]
-                     length = clustered_info.split(',')[0]
-                     length = int(''.join(c for c in length if c.isdigit()))
-                     clustered_header = clustered_info.split('>')[1].split('...')[0]
-                     clustered_header = '>' + clustered_header
-
-                     if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
-                         percent_identity = extract_identity(clustered_info)
-                     elif line.endswith('*'):
-                         percent_identity = 100.0
-                     else:
-                         raise ValueError("Percent identity not found in the string.")
-
-                     clusters[current_cluster_id].append({
-                         'header': clustered_header,
-                         'length': length,
-                         'percent_identity': percent_identity
-                     })
-
-     return clusters
-
-
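For reference, the .clstr records this parser consumes look like the snippet below (standard CD-HIT output; the sequence headers are hypothetical). The representative line ends in `*` and is assigned 100.0% identity; member lines carry an `at ...%` field handled by extract_identity, which comes from utils and is not shown in this diff:

```
>Cluster 0
0	417nt, >Genome_1|gene_0001... *
1	414nt, >Genome_2|gene_0173... at +/98.50%
```

read_cd_hit_output would turn this into `{'0': [{'header': '>Genome_1|gene_0001', 'length': 417, 'percent_identity': 100.0}, {'header': '>Genome_2|gene_0173', 'length': 414, 'percent_identity': 98.5}]}`.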
- # Summarise the information for each cluster
- def summarise_clusters(options, clusters, output):
-     multicopy_groups = defaultdict(int)  # Counter for groups with multicopy genes
-
-     with open(output, 'w') as out_f:
-         out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")
-
-         for cluster_id, seqs in clusters.items():
-             num_seqs = len(seqs)
-             lengths = [seq['length'] for seq in seqs]
-             identities = [seq['percent_identity'] for seq in seqs]
-
-             avg_length = sum(lengths) / num_seqs if num_seqs > 0 else 0
-             length_range = f"{min(lengths)}-{max(lengths)}" if num_seqs > 0 else "N/A"
-
-             avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
-             identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
-
-             out_f.write(
-                 f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
-
-             # Count genomes with more than one gene
-             genome_to_gene_count = defaultdict(int)
-             for seq in seqs:
-                 genome = seq['header'].split('|')[0].replace('>', '')
-                 genome_to_gene_count[genome] += 1
-
-             num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
-
-             # Calculate the percentage of genomes with multicopy genes
-             multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
-             category = categorise_percentage(multicopy_percentage)
-             if category:
-                 multicopy_groups[category] += 1
-
-     # Define the order of categories for printout
-     category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
-
-     # Print the number of clusters with multicopy genes in each percentage range, in the correct order
-     for category in category_order:
-         print(f"Number of clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
-
-
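Run on the two-sequence cluster from the .clstr snippet above with `-genome_num 2`, the TSV gains one row (values computed from that snippet):

```
Cluster_ID	Num_Sequences	Avg_Length	Length_Range	Avg_Identity	Identity_Range
0	2	415.50	414-417	99.25	98.50-100.00
```

Since the two genes come from different genomes, no genome is multicopy, categorise_percentage returns None, and the cluster is not counted in any bin.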
- # Main function to parse arguments and run the analysis
- def main():
-     parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
-     ### Required Arguments
-     required = parser.add_argument_group('Required Parameters')
-     required.add_argument('-input_clstr', action="store", dest="input_clstr",
-                           help='Input CD-HIT .clstr file',
-                           required=True)
-     required.add_argument('-output', action="store", dest="output",
-                           help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
-                           required=True)
-     required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
-                           help='The total number of genomes must be provided',
-                           required=True)
-     #required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
-     #                      help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
-     #                      required=True)
-
-     optional = parser.add_argument_group('Optional Arguments')
-     optional.add_argument('-output_dir', action="store", dest="output_dir",
-                           help='Default: Same as input file',
-                           required=False)
-
-     misc = parser.add_argument_group("Misc Parameters")
-     misc.add_argument("-verbose", action="store_true", dest="verbose",
-                       help="Print verbose output.",
-                       required=False)
-     misc.add_argument("-v", "--version", action="version",
-                       version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
-                       help="Print out version number and exit")
-
-
-     options = parser.parse_args()
-     print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary")
-
-     ### File handling
-     options.input_clstr = fix_path(options.input_clstr)
-     if options.output_dir is None:
-         options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
-     output_path = os.path.abspath(options.output_dir)
-     if not os.path.exists(output_path):
-         os.makedirs(output_path)
-     output_name = options.output
-     if not output_name.endswith('.tsv'):
-         output_name += '.tsv'
-     output_file_path = os.path.join(output_path, output_name)
-     ###
-
-     clusters = read_cd_hit_output(options.input_clstr)
-     summarise_clusters(options, clusters, output_file_path)
-
-
- if __name__ == "__main__":
-     main()
PyamilySeq/Constants.py DELETED
@@ -1,2 +0,0 @@
- PyamilySeq_Version = 'v0.9.0'
-
@@ -1,382 +0,0 @@
- import collections
- import subprocess
- import os
- import argparse
- from collections import defaultdict, OrderedDict
- from line_profiler_pycharm import profile
-
- try:
-     from .Constants import *
-     from .utils import *
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
-     from Constants import *
-     from utils import *
-
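line_profiler_pycharm is an IDE profiling helper, and the hard import here (together with the @profile decorators below) makes it a runtime requirement: anyone installing the package without that development dependency gets an ImportError. A common guard, shown as a sketch rather than as what the package actually did:

```python
# Sketch: fall back to a no-op decorator when the profiler is absent.
try:
    from line_profiler_pycharm import profile
except ImportError:
    def profile(func):
        # Leaves @profile-decorated functions unchanged when not profiling
        return func
```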
- def run_cd_hit(options, input_file, clustering_output, clustering_mode):
-     cdhit_command = [
-         clustering_mode,
-         '-i', input_file,
-         '-o', clustering_output,
-         '-c', str(options.pident),
-         '-s', str(options.len_diff),
-         '-T', str(options.clustering_threads),
-         '-M', str(options.clustering_memory),
-         '-d', "0",
-         '-g', "1",
-         '-sc', "1",
-         '-sf', "1"
-     ]
-     if options.verbose:
-         subprocess.run(cdhit_command)
-     else:
-         subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
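With the argparse defaults defined in main() below (-c 0.8, -s 0.20, -T 4, -M 2000) and DNA input, run_cd_hit assembles a command like the following sketch (file names hypothetical). The fixed flags keep full sequence names in the .clstr output (-d 0), use CD-HIT's accurate clustering mode (-g 1), and sort clusters and the output FASTA by cluster size (-sc 1, -sf 1):

```python
# Command assembled by run_cd_hit for the defaults (a sketch):
cdhit_command = [
    "cd-hit-est",                # clustering_mode for DNA; 'cd-hit' for AA
    "-i", "Group_7.fasta",       # hypothetical per-group input from write_fasta
    "-o", "Group_7_clustering",  # hypothetical output prefix
    "-c", "0.8",                 # options.pident: identity threshold
    "-s", "0.20",                # options.len_diff: length difference cutoff
    "-T", "4",                   # options.clustering_threads
    "-M", "2000",                # options.clustering_memory (MB)
    "-d", "0", "-g", "1", "-sc", "1", "-sf", "1",
]
```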
- @profile
- def calculate_new_rep_seq(cluster_data, length_weight=1.0, identity_weight=1.0):
-     total_length = sum(entry['length'] for entry in cluster_data)
-     avg_length = total_length / len(cluster_data)
-
-     total_identity = sum(entry['percent_identity'] for entry in cluster_data)
-     avg_identity = total_identity / len(cluster_data)
-
-     # Normalize length and identity
-     max_length = max(entry['length'] for entry in cluster_data)
-     max_identity = 100  # Assuming percent_identity is out of 100
-
-     # Calculate a score based on both length difference and percent identity
-     def score(entry):
-         normalized_length_diff = abs(entry['length'] - avg_length) / max_length
-         normalized_identity_diff = abs(entry['percent_identity'] - avg_identity) / max_identity
-         return (length_weight * normalized_length_diff) + (identity_weight * (1 - normalized_identity_diff))
-
-     rep_entry = min(cluster_data, key=score)
-     return rep_entry
-
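A small worked example of the scoring (values hypothetical). With equal weights, the entry sitting at the cluster's average length and identity scores 1.0 and is selected; note that the (1 - normalized_identity_diff) term shrinks as an entry's identity moves away from the average, so the identity component actually favours outliers, which may be a sign slip and is flagged here rather than changed:

```python
# Reproducing calculate_new_rep_seq's arithmetic on a hypothetical cluster:
cluster = [
    {'header': '>g1', 'length': 400, 'percent_identity': 100.0},
    {'header': '>g2', 'length': 350, 'percent_identity': 96.0},
    {'header': '>g3', 'length': 300, 'percent_identity': 92.0},
]
avg_length = 350.0   # mean of 400, 350, 300
avg_identity = 96.0  # mean of 100.0, 96.0, 92.0
max_length = 400

def score(entry):
    return (abs(entry['length'] - avg_length) / max_length
            + (1 - abs(entry['percent_identity'] - avg_identity) / 100))

# >g1 and >g3 each score 0.125 + 0.96 = 1.085; >g2 scores 0.0 + 1.0 = 1.0
assert min(cluster, key=score)['header'] == '>g2'
```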
-
-
- def length_within_threshold(rep_length, length, len_diff):
-     return abs(rep_length - length) / rep_length <= len_diff
-
-
- def check_if_all_identical(clustered_sequences):
-     lengths = {entry['length'] for cluster in clustered_sequences.values() for entry in cluster}
-     perc_idents = {entry['percent_identity'] for cluster in clustered_sequences.values() for entry in cluster}
-
-     return len(lengths) == 1 and len(perc_idents) == 1
-
-
-
- def read_fasta_groups(options):
-     groups = defaultdict(list)
-     genome_count = defaultdict(int)
-     current_group = None
-     current_sequence = []
-
-     # Parse the list of specific group numbers if provided
-     selected_groups = None
-     if options.groups is not None:
-         selected_groups = [int(g.strip()) for g in options.groups.split(',')]
-
-     with open(options.input_fasta, 'r') as f:
-         for line in f:
-             if line.startswith('>'):
-                 if current_group is not None and (selected_groups is None or group_number in selected_groups):
-                     groups[current_group].append((current_group_header, ''.join(current_sequence)))
-
-                 current_group_header = line.strip()
-                 current_group = current_group_header.split('|')[0]
-                 genome = current_group_header.split('|')[1]
-                 current_sequence = []
-                 genome_count[genome] += 1
-
-                 # Only process if group matches the selected_groups or if no specific groups were provided
-                 group_number = int(current_group.replace('>Group_', ''))  # Assuming format 'Group_n'
-                 if selected_groups is not None and group_number not in selected_groups:
-                     current_group = None  # Skip this group
-                     continue
-
-             else:
-                 current_sequence.append(line.strip())
-
-     if current_group is not None:
-         groups[current_group].append((current_group_header, ''.join(current_sequence)))
-
-     return groups, genome_count
-
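read_fasta_groups assumes three pipe-separated fields per header, group|genome|gene; a hypothetical header and the fields the code extracts from it:

```python
# Header layout assumed by read_fasta_groups (example values hypothetical):
header = ">Group_7|Genome_3|gene_0042"
current_group = header.split('|')[0]                      # '>Group_7'
genome = header.split('|')[1]                             # 'Genome_3'
group_number = int(current_group.replace('>Group_', ''))  # 7
```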
-
- def write_fasta(sequences, output_file):
-     with open(output_file, 'w') as f:
-         for header, seq in sequences:
-             f.write(f"{header}\n{seq}\n")
-
-
- def read_cd_hit_output(clustering_output):
-     clusters = OrderedDict()
-
-     with open(clustering_output, 'r') as f:
-         current_cluster_id = None
-
-         for line in f:
-             line = line.strip()
-             if line.startswith(">Cluster"):
-                 current_cluster_id = line.split(' ')[1]
-                 clusters[current_cluster_id] = []
-             elif line and current_cluster_id is not None:
-                 parts = line.split('\t')
-                 if len(parts) > 1:
-                     clustered_info = parts[1]
-                     length = clustered_info.split(',')[0]
-                     length = int(''.join(c for c in length if c.isdigit()))
-                     clustered_header = clustered_info.split('>')[1].split('...')[0]
-                     clustered_header = '>' + clustered_header
-
-                     if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
-                         percent_identity = extract_identity(line)
-                     elif line.endswith('*'):
-                         percent_identity = 100.0
-                     else:
-                         raise ValueError("Percent identity not found in the string.")
-
-                     clusters[current_cluster_id].append({
-                         'header': clustered_header,
-                         'length': length,
-                         'percent_identity': percent_identity
-                     })
-
-     return clusters
-
- @profile
- def separate_groups(options, clustering_mode):
-     groups, genome_count = read_fasta_groups(options)
-
-     paralog_groups = defaultdict(int)  # To track number of paralog groups
-
-     for group_header, sequences in groups.items():
-         if options.verbose:
-             print(f"\n###\nCurrent Group: {group_header.replace('>','')}\n")
-
-         group_name = group_header.split('|')[0]  # Get the group part (e.g., '>Group_n')
-
-         # Count genomes with more than one gene
-         genome_to_gene_count = defaultdict(int)
-         for header, _ in sequences:
-             genome = header.split('|')[1]
-             genome_to_gene_count[genome] += 1
-
-         num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
-
-         # Check if the group meets the threshold for having paralogs
-         if options.groups is None:
-             if (num_genomes_with_multiple_genes / options.genome_num) * 100 < options.group_threshold:
-                 continue
-
-         group_file_name = group_name.replace('>', '')
-
-         temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
-         write_fasta(sequences, temp_fasta)
-
-         # Run cd-hit on the individual group
-         clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
-
-         run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
-
-         # Read the clustering results to find subgroups
-         clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')
-
-         if len(clustered_sequences) == 1:
-             # Detect if all sequences are identical in length and percentage identity
-             all_same = check_if_all_identical(clustered_sequences)
-
-             # **Global subgroup counter for the entire major group**
-             subgroup_id = 0
-
-             if not all_same:
-                 # Iterate through each cluster in clustered_sequences
-                 for cluster_key, cluster in clustered_sequences.items():
-
-                     remaining_sequences_tmp = sequences.copy()  # Track unprocessed sequences
-                     remaining_sequences = [entry for entry in remaining_sequences_tmp if entry[0] in
-                                            {seq_entry['header'] for seq_entry in cluster}]
-                     sequences_to_remove = []
-
-                     while remaining_sequences:
-                         # Track subgroups for this cluster pass
-                         subgroup_sequences = []
-                         genome_seen = set()
-
-                         # Recalculate representative sequence dynamically for this cluster
-                         rep = calculate_new_rep_seq(
-                             [entry for entry in cluster if entry['header'] in (h for h, _ in remaining_sequences)]
-                         )
-
-                         # Find the sequence corresponding to rep['header'] from the list of sequences
-                         rep_seq = next((seq for header, seq in sequences if header == rep['header']), None)
-
-                         # Save previously checked seqs, so we don't have to compare them again.
-                         checked = collections.defaultdict(float)
-
-                         # Process each genome to select the best matching sequence
-                         for genome in genome_to_gene_count:
-                             best_sequence = None
-                             best_score = None  # Initialise as None so the first candidate is always accepted
-
-                             # Iterate over each sequence in the remaining sequences for this genome
-                             for header, seq in remaining_sequences:
-                                 genome_id = header.split('|')[1]
-
-                                 if genome_id == genome:  # Ensure this sequence belongs to the current genome
-                                     if rep_seq == seq:
-                                         levenshtein_distance = 0
-                                     else:
-                                         if seq in checked:
-                                             levenshtein_distance = checked[seq]
-                                         else:
-                                             levenshtein_distance = levenshtein_distance_calc(rep_seq, seq)
-                                             checked[seq] = levenshtein_distance
-                                     # Lower Levenshtein distance means more 'similar' sequences
-                                     score = levenshtein_distance
-
-                                     # Keep the sequence with the lowest (best) score seen so far
-                                     if best_sequence is None:
-                                         best_score = score
-                                         best_sequence = (header, seq)  # Store the best matching sequence for this genome
-                                     elif score < best_score:
-                                         best_score = score
-                                         best_sequence = (header, seq)  # Store the best matching sequence for this genome
-
-                             # Add the best sequence for this genome to the subgroup
-                             if best_sequence is not None:
-                                 new_header = f">{group_file_name}_subgroup_{subgroup_id}|{best_sequence[0].split('|')[1]}|{best_sequence[0].split('|')[2]}"
-                                 subgroup_sequences.append((new_header, best_sequence[1]))
-                                 sequences_to_remove.append(best_sequence)
-                                 genome_seen.add(genome)
-
-                         # Write each subgroup into a separate FASTA file
-                         if subgroup_sequences:
-                             subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
-                             write_fasta(subgroup_sequences, subgroup_file)
-
-                         # Remove processed sequences from the remaining list
-                         remaining_sequences = [item for item in remaining_sequences if
-                                                item[0] not in {h for h, _ in sequences_to_remove}]
-
-                         # Increment subgroup ID for the next subgroup
-                         subgroup_id += 1
-                         paralog_groups[group_name] += 1  # Count this group as a paralog group
-
-             else:
-                 # Condition 2: If sequences are identical, distribute genes evenly into subgroups
-                 num_subgroups = 1000
-                 subgroup_sequences = defaultdict(list)  # Store sequences for each subgroup
-                 genome_count = defaultdict(int)  # Count how many genes have been assigned to each genome
-
-                 # Iterate over all sequences regardless of whether the genome has been seen
-                 for header, seq in sequences:
-                     genome = header.split('|')[1]
-
-                     # Determine the next subgroup for this genome
-                     subgroup_id = genome_count[genome] % num_subgroups
-                     new_header = f">{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
-                     subgroup_sequences[subgroup_id].append((new_header, seq))
-
-                     # Increment the count for this genome
-                     genome_count[genome] += 1
-
-                 # Write out each subgroup to a separate FASTA file
-                 for subgroup_id, seqs in subgroup_sequences.items():
-                     subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
-                     write_fasta(seqs, subgroup_file)
-
-                     # Increment subgroup ID globally for the next subgroup
-                     subgroup_id += 1
-                     paralog_groups[group_name] += 1  # Count this group as a paralog group
-
-         # Clean up temporary fasta file if the option is set
-         if options.delete_temp_files:
-             if temp_fasta and os.path.exists(temp_fasta):
-                 os.remove(temp_fasta)
-             if os.path.exists(clustering_output + '.clstr'):
-                 os.remove(clustering_output + '.clstr')
-             if os.path.exists(clustering_output):
-                 os.remove(clustering_output)
-
-     # Print metrics about paralog groups
-     print(f"Identified {len(paralog_groups)} paralog groups:")
-     for group_id, count in paralog_groups.items():
-         print(f"Group ID: {group_id}, Number of new groups: {count}")
-
-
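levenshtein_distance_calc is imported from utils, which this diff does not include; a minimal dynamic-programming sketch of the edit distance it presumably computes:

```python
# Sketch of a row-wise Levenshtein (edit) distance, as presumably
# implemented by utils.levenshtein_distance_calc (not shown in this diff):
def levenshtein_distance_calc(a, b):
    prev = list(range(len(b) + 1))  # distances from '' to each prefix of b
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

assert levenshtein_distance_calc("ACGT", "AGGT") == 1  # one substitution
```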
- def main():
-     parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Group-Splitter - A tool to split multi-copy gene groups identified by PyamilySeq.')
-     ### Required Arguments
-     required = parser.add_argument_group('Required Parameters')
-     required.add_argument('-input_fasta', action='store', dest='input_fasta',
-                           help='Input FASTA file containing gene groups.',
-                           required=True)
-     required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA', choices=['AA', 'DNA'],
-                           help='Default - DNA: Are groups "DNA" or "AA" sequences?',
-                           required=True)
-     required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
-                           help='The total number of genomes must be provided',
-                           required=True)
-     required.add_argument('-output_dir', action='store', dest='output_dir',
-                           help='Output directory.',
-                           required=True)
-
-     regrouping_params = parser.add_argument_group('Regrouping Parameters')
-     regrouping_params.add_argument('-groups', action="store", dest='groups', default=None,
-                                    help='Default - auto: Detect groups to be split (see -group_threshold). '
-                                         'Provide "-groups 1,2,3,4" with group IDs to split specific groups.',
-                                    required=False)
-     regrouping_params.add_argument('-group_threshold', action='store', dest='group_threshold', type=float, default=80,
-                                    help='Minimum percentage of genomes with multi-copy (default: 80.0) - Does not work with "-groups"')
-
-     cdhit_params = parser.add_argument_group('CD-HIT Reclustering Parameters')
-     cdhit_params.add_argument('-c', action='store', dest='pident', type=float, default=0.8,
-                               help='Sequence identity threshold (default: 0.8) - Probably should be higher than what was used in initial clustering.')
-     cdhit_params.add_argument('-s', action='store', dest='len_diff', type=float, default=0.20,
-                               help="Length difference cutoff (default: 0.20) - Often the most impactful parameter to split 'multi-copy' gene groups.")
-     cdhit_params.add_argument('-T', action='store', dest='clustering_threads', type=int, default=4,
-                               help='Number of threads for clustering (default: 4)')
-     cdhit_params.add_argument('-M', action='store', dest='clustering_memory', type=int, default=2000,
-                               help='Memory limit in MB for clustering (default: 2000)')
-
-     misc = parser.add_argument_group("Misc Parameters")
-     misc.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
-                       help='Default: Delete all temporary files after processing.',
-                       required=False)
-     misc.add_argument("-verbose", action="store_true", dest="verbose",
-                       help="Print verbose output.",
-                       required=False)
-     misc.add_argument("-v", "--version", action="version",
-                       version=f"PyamilySeq: Group-Splitter version {PyamilySeq_Version} - Exiting",
-                       help="Print out version number and exit")
-
-
-     options = parser.parse_args()
-     print("Running PyamilySeq: Group-Splitter " + PyamilySeq_Version)
-
-
-     if not os.path.exists(options.output_dir):
-         os.makedirs(options.output_dir)
-
-     if options.sequence_type == 'DNA':
-         clustering_mode = 'cd-hit-est'
-     else:
-         clustering_mode = 'cd-hit'
-
-     separate_groups(options, clustering_mode)
-
-
- if __name__ == "__main__":
-     main()
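For completeness, a hypothetical Group-Splitter invocation using the required flags defined above (the script name is assumed for illustration and is not visible in this diff):

```python
# Hypothetical invocation of Group-Splitter (script name assumed):
import subprocess

subprocess.run([
    "python", "Group_Splitter.py",              # assumed entry script
    "-input_fasta", "pan_genome_groups.fasta",  # grouped FASTA from PyamilySeq
    "-sequence_type", "DNA",                    # selects cd-hit-est
    "-genome_num", "50",
    "-output_dir", "split_groups",
], check=True)
```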