PyamilySeq 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Constants.py +1 -1
- PyamilySeq/Group_Splitter.py +350 -0
- PyamilySeq/PyamilySeq.py +12 -20
- PyamilySeq/PyamilySeq_Genus.py +1 -1
- PyamilySeq/PyamilySeq_Species.py +3 -2
- PyamilySeq/utils.py +128 -25
- {PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/METADATA +45 -8
- PyamilySeq-0.8.1.dist-info/RECORD +15 -0
- {PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/entry_points.txt +1 -0
- PyamilySeq-0.7.1.dist-info/RECORD +0 -14
- {PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-0.7.1.dist-info → PyamilySeq-0.8.1.dist-info}/top_level.txt +0 -0
PyamilySeq/Constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
PyamilySeq_Version = 'v0.
|
|
1
|
+
PyamilySeq_Version = 'v0.8.1'
|
|
2
2
|
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
import argparse
|
|
4
|
+
from collections import defaultdict, OrderedDict
|
|
5
|
+
from line_profiler_pycharm import profile
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from .Constants import *
|
|
9
|
+
from .utils import *
|
|
10
|
+
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
11
|
+
from Constants import *
|
|
12
|
+
from utils import *
|
|
13
|
+
|
|
14
|
+
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
    """
    Invoke the clustering executable on one FASTA file.

    Parameters:
    - options: Parsed options; pident, len_diff, clustering_threads,
      clustering_memory and verbose are read.
    - input_file: FASTA file passed to the tool with '-i'.
    - clustering_output: Output path passed to the tool with '-o'.
    - clustering_mode: Name of the executable to run (main() passes
      'cd-hit-est' for DNA input and 'cd-hit' otherwise).

    The exit status of the tool is not inspected here.
    """
    flag_values = (
        ('-i', input_file),
        ('-o', clustering_output),
        ('-c', str(options.pident)),
        ('-s', str(options.len_diff)),
        ('-T', str(options.clustering_threads)),
        ('-M', str(options.clustering_memory)),
        ('-d', "0"),
        ('-sc', "1"),
        ('-sf', "1"),
    )
    command = [clustering_mode]
    for flag, value in flag_values:
        command.extend((flag, value))

    # Unless verbose output was requested, discard everything the tool prints.
    stream_kwargs = {}
    if not options.verbose:
        stream_kwargs = {'stdout': subprocess.DEVNULL, 'stderr': subprocess.DEVNULL}
    subprocess.run(command, **stream_kwargs)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def calculate_new_rep_seq(cluster_data):
    """
    Choose the representative entry from a list of clustered-sequence records.

    Parameters:
    - cluster_data: Sequence of dicts, each with 'length' and
      'percent_identity' keys.

    Returns the entry with the lowest value of
    abs(length - mean length) + (100 - abs(percent_identity - mean identity));
    on ties the earliest entry wins. An empty cluster_data raises
    ZeroDivisionError.
    """
    entry_count = len(cluster_data)
    mean_length = sum(item['length'] for item in cluster_data) / entry_count
    mean_identity = sum(item['percent_identity'] for item in cluster_data) / entry_count

    best_entry = None
    best_score = None
    for item in cluster_data:
        length_gap = abs(item['length'] - mean_length)
        identity_gap = abs(item['percent_identity'] - mean_identity)
        item_score = length_gap + (100 - identity_gap)  # terms are equally weighted
        # Strict '<' keeps the first entry when several share the minimum.
        if best_score is None or item_score < best_score:
            best_entry, best_score = item, item_score
    return best_entry
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def length_within_threshold(rep_length, length, len_diff):
    """
    Report whether a sequence length is close enough to the representative's.

    Returns True when the absolute length difference, expressed as a fraction
    of rep_length, is at most len_diff. A rep_length of 0 raises
    ZeroDivisionError.
    """
    relative_gap = abs(rep_length - length) / rep_length
    return relative_gap <= len_diff
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def check_if_all_identical(clustered_sequences):
    """
    Report whether every clustered entry shares one length and one identity.

    Parameters:
    - clustered_sequences: Mapping of cluster id to a list of dicts carrying
      'length' and 'percent_identity'.

    Returns True only when exactly one distinct length and exactly one
    distinct percent identity occur across all clusters (so an empty mapping
    yields False).
    """
    entries = [entry for members in clustered_sequences.values() for entry in members]
    distinct_lengths = {entry['length'] for entry in entries}
    distinct_identities = {entry['percent_identity'] for entry in entries}
    if len(distinct_lengths) != 1:
        return False
    return len(distinct_identities) == 1
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def read_fasta_groups(fasta_file):
    """
    Read a grouped FASTA file whose headers are '|'-separated.

    Parameters:
    - fasta_file: Path of the FASTA file to read.

    Returns a (groups, genome_count) pair:
    - groups: defaultdict mapping the first header field (including the
      leading '>') to a list of (full header, sequence) tuples.
    - genome_count: defaultdict mapping the second header field to the number
      of records that carry it.

    A header without a second '|' field raises IndexError.
    """
    groups = defaultdict(list)
    genome_count = defaultdict(int)
    group_key = None
    header = None
    chunks = []

    with open(fasta_file, 'r') as handle:
        for raw in handle:
            if not raw.startswith('>'):
                chunks.append(raw.strip())
                continue

            # A new header closes the record that was being collected.
            if group_key is not None:
                groups[group_key].append((header, ''.join(chunks)))

            header = raw.strip()
            fields = header.split('|')
            group_key = fields[0]
            genome_count[fields[1]] += 1
            chunks = []

    # Flush the final record, which has no following header to close it.
    if group_key is not None:
        groups[group_key].append((header, ''.join(chunks)))

    return groups, genome_count
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def write_fasta(sequences, output_file):
    """
    Write (header, sequence) pairs to output_file, one line each.

    Headers are written exactly as given (no '>' is added) and each sequence
    is written on a single line.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(f"{header}\n{seq}\n" for header, seq in sequences)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def read_cd_hit_output(clustering_output):
    """
    Parse a CD-HIT '.clstr' file into an ordered mapping of clusters.

    Parameters:
    - clustering_output: Path of the '.clstr' file to read.

    Returns an OrderedDict mapping each cluster id (the token following
    '>Cluster') to a list of dicts with the keys 'header' (sequence header
    with a leading '>'), 'length' (int) and 'percent_identity' (float).

    Raises ValueError when a member line is neither marked with '*' nor
    carries an identity value.
    """
    clusters = OrderedDict()
    current_cluster_id = None

    with open(clustering_output, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith(">Cluster"):
                current_cluster_id = line.split(' ')[1]
                clusters[current_cluster_id] = []
                continue
            if not line or current_cluster_id is None:
                continue

            parts = line.split('\t')
            if len(parts) <= 1:
                continue

            clustered_info = parts[1]
            # The first comma-separated field is the length plus a unit suffix
            # (e.g. '300nt'); keep only the digits.
            length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
            clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]

            if line.endswith('*'):
                # Representative line. This must be tested before the 'at'
                # check below, because 'at' is a plain substring test and also
                # matches headers that merely contain those letters.
                percent_identity = 100.0
            elif 'at' in clustered_info:
                percent_identity = extract_identity(line)
            elif '*' in line:
                percent_identity = 100.0
            else:
                raise ValueError("Percent identity not found in the string.")

            clusters[current_cluster_id].append({
                'header': clustered_header,
                'length': length,
                'percent_identity': percent_identity
            })

    return clusters
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def separate_groups(input_fasta, options, clustering_mode):
    """
    Split gene groups that contain paralogs into sub-groups, one FASTA each.

    Parameters:
    - input_fasta: Grouped FASTA file; headers are '|'-separated and the
      first three fields are used (group, genome, then a third field that is
      copied through unchanged).
    - options: Parsed options; output_dir, percent_threshold, len_diff,
      pident and delete_temp_files are read here (run_cd_hit reads more).
    - clustering_mode: Clustering executable name forwarded to run_cd_hit.

    For every group in which the percentage of genomes holding more than one
    gene reaches options.percent_threshold, the group is clustered and then
    written out as '<group>_subgroup_<n>.fasta' files in options.output_dir.
    A summary of the number of sub-groups per group is printed at the end.
    """
    groups, _ = read_fasta_groups(input_fasta)

    paralog_groups = defaultdict(int)  # group name -> number of sub-groups written

    # options.pident is handed to the clustering tool as a 0-1 fraction, while
    # calculate_similarity() reports 0-100; compare both on the percent scale.
    identity_cutoff = options.pident * 100 if options.pident <= 1 else options.pident

    for group_header, sequences in groups.items():
        group_name = group_header.split('|')[0]  # the group part, e.g. '>Group_n'

        # Count the genes each genome contributes to this group.
        genome_to_gene_count = defaultdict(int)
        for header, _ in sequences:
            genome_to_gene_count[header.split('|')[1]] += 1

        total_genomes = len(genome_to_gene_count)
        genomes_with_paralogs = sum(1 for count in genome_to_gene_count.values() if count > 1)

        # Skip groups that do not meet the threshold for having paralogs.
        if total_genomes == 0 or (genomes_with_paralogs / total_genomes) * 100 < options.percent_threshold:
            continue

        group_file_name = group_name.replace('>', '')

        temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
        write_fasta(sequences, temp_fasta)

        # Cluster the individual group and read back the resulting clusters.
        clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
        run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
        clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')

        if not check_if_all_identical(clustered_sequences):
            all_entries = [entry for cluster in clustered_sequences.values() for entry in cluster]

            # First sequence recorded for each header, for representative lookup.
            sequence_by_header = {}
            for header, seq in sequences:
                sequence_by_header.setdefault(header, seq)

            subgroup_id = 0
            remaining_sequences = list(sequences)  # sequences not yet assigned

            while remaining_sequences:
                remaining_headers = {header for header, _ in remaining_sequences}

                # Recalculate the representative from the entries still unassigned.
                rep = calculate_new_rep_seq(
                    [entry for entry in all_entries if entry['header'] in remaining_headers]
                )
                rep_seq = sequence_by_header.get(rep['header'])

                remaining_by_genome = defaultdict(list)
                for item in remaining_sequences:
                    remaining_by_genome[item[0].split('|')[1]].append(item)

                subgroup_sequences = []
                picked_headers = set()

                # Each genome contributes at most its best-matching sequence per pass.
                for genome in genome_to_gene_count:
                    best_sequence = None
                    best_score = -1

                    for header, seq in remaining_by_genome.get(genome, ()):
                        length = len(seq)
                        # Cheap length filter first; the alignment below is costly.
                        if not length_within_threshold(rep['length'], length, options.len_diff):
                            continue

                        if rep_seq == seq:
                            perc_ident = 100.0
                        else:
                            perc_ident = calculate_similarity(rep_seq, seq)
                        if perc_ident < identity_cutoff:
                            continue

                        # High identity and a small length difference score best.
                        length_diff_ratio = abs(rep['length'] - length) / rep['length']
                        score = perc_ident - (length_diff_ratio * 100)
                        if score > best_score:
                            best_score = score
                            best_sequence = (header, seq)

                    if best_sequence is not None:
                        fields = best_sequence[0].split('|')
                        new_header = f">{group_file_name}_subgroup_{subgroup_id}|{fields[1]}|{fields[2]}"
                        subgroup_sequences.append((new_header, best_sequence[1]))
                        picked_headers.add(best_sequence[0])

                if not subgroup_sequences:
                    # Nothing qualified in this pass, so another pass would pick
                    # nothing either; stop instead of looping forever.
                    break

                subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
                write_fasta(subgroup_sequences, subgroup_file)

                remaining_sequences = [item for item in remaining_sequences
                                       if item[0] not in picked_headers]

                subgroup_id += 1
                paralog_groups[group_name] += 1

        else:
            # All clustered entries share one length and identity: give each
            # genome's k-th gene to sub-group k.
            num_subgroups = 1000
            subgroup_sequences = defaultdict(list)
            genes_assigned = defaultdict(int)  # genome -> genes placed so far

            for header, seq in sequences:
                fields = header.split('|')
                genome = fields[1]

                subgroup_id = genes_assigned[genome] % num_subgroups
                # Leading '>' added so the record is a valid FASTA header, as in
                # the branch above.
                new_header = f">{group_file_name}_subgroup_{subgroup_id}|{genome}|{fields[2]}"
                subgroup_sequences[subgroup_id].append((new_header, seq))

                genes_assigned[genome] += 1

            for subgroup_id, seqs in subgroup_sequences.items():
                subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
                write_fasta(seqs, subgroup_file)
                # NOTE(review): counted once per sub-group written, matching the
                # "Number of new groups" summary below - confirm intended.
                paralog_groups[group_name] += 1

        # Clean up the per-group temporary files if the option is set.
        if options.delete_temp_files:
            for path in (temp_fasta, clustering_output + '.clstr', clustering_output):
                if path and os.path.exists(path):
                    os.remove(path)

    # Print metrics about paralog groups
    print(f"Identified {len(paralog_groups)} paralog groups:")
    for group_id, count in paralog_groups.items():
        print(f"Group ID: {group_id}, Number of new groups: {count}")
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def main():
    """
    Command-line entry point for Group-Splitter.

    Parses the arguments, creates the output directory if needed, picks the
    clustering executable from -sequence_type ('cd-hit-est' for DNA,
    'cd-hit' otherwise) and runs separate_groups() on the input FASTA.
    """
    parser = argparse.ArgumentParser(description='Group-Splitter: ' + PyamilySeq_Version + ': A tool to split "paralogous" groups identified by PyamilySeq.')
    ### Required Arguments
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-input_fasta', action='store', dest='input_fasta',
                          help='Input FASTA file containing gene groups.',
                          required=True)
    required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
                          help='Default - DNA: Are groups "DNA" or "AA" sequences?',
                          required=False)
    required.add_argument('-output_dir', action='store', dest='output_dir',
                          help='Output directory.',
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')

    optional.add_argument('-pident', action='store', dest='pident', type=float, default=0.9,
                          help='Sequence identity threshold (default: 0.9)')
    optional.add_argument('-len_diff', action='store', dest='len_diff', type=float, default=0.05,
                          help='Length difference threshold (default: 0.05)')
    optional.add_argument('-clustering_threads', action='store', dest='clustering_threads', type=int, default=4,
                          help='Number of threads for clustering (default: 4)')
    optional.add_argument('-clustering_memory', action='store', dest='clustering_memory', type=int, default=2000,
                          help='Memory limit in MB for clustering (default: 2000)')
    optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
                          help='Minimum percentage of genomes with paralogs (default: 80.0)')
    optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
    optional.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
                          help='Default: Delete all temporary files after processing.')

    misc = parser.add_argument_group('Misc Arguments')
    # The 'version' action prints and exits as soon as '-v' is seen, so it
    # works without the required arguments (a store_true flag checked after
    # parse_args() never could, because parsing fails first).
    misc.add_argument('-v', action='version',
                      version=f"Group-Splitter version {PyamilySeq_Version}",
                      help='Print out version number and exit')

    options = parser.parse_args()

    # exist_ok avoids a race between checking for and creating the directory.
    os.makedirs(options.output_dir, exist_ok=True)

    clustering_mode = 'cd-hit-est' if options.sequence_type == 'DNA' else 'cd-hit'

    separate_groups(options.input_fasta, options, clustering_mode)

    print("Done")
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
if __name__ == "__main__":
|
|
350
|
+
main()
|
PyamilySeq/PyamilySeq.py
CHANGED
|
@@ -27,7 +27,7 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
|
27
27
|
'-o', clustering_output,
|
|
28
28
|
'-c', str(options.pident),
|
|
29
29
|
'-s', str(options.len_diff),
|
|
30
|
-
'-T', str(options.
|
|
30
|
+
'-T', str(options.threads),
|
|
31
31
|
'-M', str(options.clustering_memory),
|
|
32
32
|
'-d', "0",
|
|
33
33
|
'-sc', "1",
|
|
@@ -41,7 +41,6 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
|
|
|
41
41
|
|
|
42
42
|
def main():
|
|
43
43
|
parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': A tool that groups genes into unique clusters.')
|
|
44
|
-
vparser = argparse.ArgumentParser()
|
|
45
44
|
### Required Arguments
|
|
46
45
|
required = parser.add_argument_group('Required Arguments')
|
|
47
46
|
required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
|
|
@@ -85,8 +84,8 @@ def main():
|
|
|
85
84
|
clustering_args.add_argument("-mem", action="store", dest="clustering_memory", type=int, default=4000,
|
|
86
85
|
help="Default 4000: Memory to be allocated for clustering (in MBs).",
|
|
87
86
|
required=False)
|
|
88
|
-
clustering_args.add_argument("-t", action="store", dest="
|
|
89
|
-
help="Default
|
|
87
|
+
clustering_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
|
|
88
|
+
help="Default 8: Threads to be allocated for clustering and/or alignment.",
|
|
90
89
|
required=False)
|
|
91
90
|
|
|
92
91
|
###Partial-Mode Arguments
|
|
@@ -125,28 +124,19 @@ def main():
|
|
|
125
124
|
output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
|
|
126
125
|
help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
|
|
127
126
|
required=False)
|
|
128
|
-
output_args.add_argument('-
|
|
129
|
-
|
|
127
|
+
output_args.add_argument('-no_gpa', action='store_false', dest='gene_presence_absence_out',
|
|
128
|
+
help='Do not create a Roary/Panaroo formatted gene_presence_absence.csv (created by default) - Required for Coinfinder and other downstream tools',
|
|
130
129
|
required=False)
|
|
131
130
|
|
|
132
131
|
### Misc Arguments
|
|
133
132
|
misc = parser.add_argument_group('Misc')
|
|
134
|
-
misc.add_argument('-verbose', action='store_true', dest='verbose', default=None,
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
version = vparser.add_argument_group('Version')
|
|
139
|
-
version.add_argument('-v', action='store_true', dest='version',
|
|
133
|
+
misc.add_argument('-verbose', action='store_true', dest='verbose', default=None,
|
|
134
|
+
help='Default - False: Print out runtime messages',
|
|
135
|
+
required = False)
|
|
136
|
+
misc.add_argument('-v', action='store_true', dest='version',
|
|
140
137
|
help='Default - False: Print out version number and exit',
|
|
141
138
|
required=False)
|
|
142
139
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
args, unknown = vparser.parse_known_args()
|
|
146
|
-
|
|
147
|
-
if args.version == True:
|
|
148
|
-
sys.exit("PyamilySeq version: "+PyamilySeq_Version)
|
|
149
|
-
|
|
150
140
|
options = parser.parse_args()
|
|
151
141
|
|
|
152
142
|
### Checking all required parameters are provided by user #!!# Doesn't seem to work
|
|
@@ -265,6 +255,7 @@ def main():
|
|
|
265
255
|
self.output_dir = options.output_dir
|
|
266
256
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
267
257
|
self.write_groups = options.write_groups
|
|
258
|
+
self.threads = options.threads
|
|
268
259
|
self.align_core = options.align_core
|
|
269
260
|
self.fasta = combined_out_file
|
|
270
261
|
self.verbose = options.verbose
|
|
@@ -283,6 +274,7 @@ def main():
|
|
|
283
274
|
self.output_dir = options.output_dir
|
|
284
275
|
self.gene_presence_absence_out = options.gene_presence_absence_out
|
|
285
276
|
self.write_groups = options.write_groups
|
|
277
|
+
self.threads = options.threads
|
|
286
278
|
self.align_core = options.align_core
|
|
287
279
|
self.fasta = options.original_fasta
|
|
288
280
|
self.verbose = options.verbose
|
|
@@ -299,5 +291,5 @@ def main():
|
|
|
299
291
|
"Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
|
|
300
292
|
|
|
301
293
|
if __name__ == "__main__":
|
|
302
|
-
|
|
294
|
+
print("Running PyamilySeq "+PyamilySeq_Version)
|
|
303
295
|
main()
|
PyamilySeq/PyamilySeq_Genus.py
CHANGED
|
@@ -199,7 +199,7 @@ def cluster(options):
|
|
|
199
199
|
outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
200
200
|
Number_Of_Second_Extending_But_Same_Genomes))
|
|
201
201
|
|
|
202
|
-
if options.gene_presence_absence_out !=
|
|
202
|
+
if options.gene_presence_absence_out != False:
|
|
203
203
|
gene_presence_absence_output(options,genus_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
204
204
|
|
|
205
205
|
if options.run_mode == 'Full':
|
PyamilySeq/PyamilySeq_Species.py
CHANGED
|
@@ -12,6 +12,8 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
|
12
12
|
from utils import *
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
#def output_fasta(options, gene_families):
|
|
16
|
+
|
|
15
17
|
def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
|
|
16
18
|
print("Outputting gene_presence_absence file")
|
|
17
19
|
output_dir = os.path.abspath(options.output_dir)
|
|
@@ -227,7 +229,7 @@ def cluster(options):
|
|
|
227
229
|
outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
|
|
228
230
|
Number_Of_Second_Extending_But_Same_Genomes))
|
|
229
231
|
#Report number of first and second clusters and do the same for genus
|
|
230
|
-
if options.gene_presence_absence_out !=
|
|
232
|
+
if options.gene_presence_absence_out != False:
|
|
231
233
|
gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)
|
|
232
234
|
|
|
233
235
|
|
|
@@ -255,7 +257,6 @@ def cluster(options):
|
|
|
255
257
|
if options.write_groups != None and options.fasta != None:
|
|
256
258
|
print("Outputting gene group FASTA files")
|
|
257
259
|
sequences = read_fasta(options.fasta)
|
|
258
|
-
#output_dir = os.path.dirname(os.path.abspath(options.output_dir))
|
|
259
260
|
output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
|
|
260
261
|
write_groups(options,output_dir, key_order, cores, sequences,
|
|
261
262
|
pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
|
PyamilySeq/utils.py
CHANGED
|
@@ -5,6 +5,8 @@ import glob
|
|
|
5
5
|
import collections
|
|
6
6
|
from tempfile import NamedTemporaryFile
|
|
7
7
|
import sys
|
|
8
|
+
from line_profiler_pycharm import profile
|
|
9
|
+
import re
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
################### We are currently fixed using Table 11
|
|
@@ -30,6 +32,66 @@ def translate_frame(sequence):
|
|
|
30
32
|
translate = ''.join([gencode.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
|
|
31
33
|
return translate
|
|
32
34
|
|
|
35
|
+
def calculate_similarity(seq1, seq2):
    """
    Return the percent identity (0-100) between two sequences.

    Parameters:
    - seq1, seq2: Sequences to compare (strings).

    Equal-length sequences are compared position by position. Sequences of
    different length are globally aligned (match +1, mismatch -1, gap -1) and
    the number of matching aligned positions is divided by the longer length.
    Two empty sequences are reported as 100.0.
    """
    len1, len2 = len(seq1), len(seq2)

    # If lengths are the same, directly compare without alignment
    if len1 == len2:
        if len1 == 0:
            return 100.0  # nothing differs; also avoids dividing by zero
        matches = sum(c1 == c2 for c1, c2 in zip(seq1, seq2))
        return (matches / len1) * 100

    # For different lengths, fill a global-alignment score matrix whose first
    # row and column hold the cumulative gap penalties.
    score_matrix = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    for i in range(len1 + 1):
        score_matrix[i][0] = -i
    for j in range(len2 + 1):
        score_matrix[0][j] = -j

    for i in range(1, len1 + 1):
        row = score_matrix[i]
        prev_row = score_matrix[i - 1]
        c1 = seq1[i - 1]
        for j in range(1, len2 + 1):
            diagonal = prev_row[j - 1] + (1 if c1 == seq2[j - 1] else -1)
            row[j] = max(diagonal, prev_row[j] - 1, row[j - 1] - 1)

    # Trace back from the bottom-right corner, preferring diagonal, then a gap
    # in seq2, then a gap in seq1. Only diagonal steps can be matches, so the
    # aligned strings themselves never need to be built.
    matches = 0
    i, j = len1, len2
    while i > 0 or j > 0:
        current_score = score_matrix[i][j]
        if i > 0 and j > 0:
            same = seq1[i - 1] == seq2[j - 1]
            if current_score == score_matrix[i - 1][j - 1] + (1 if same else -1):
                matches += same
                i -= 1
                j -= 1
                continue
        if i > 0 and current_score == score_matrix[i - 1][j] - 1:
            i -= 1
        else:
            j -= 1

    # Similarity is relative to the longer of the two sequences.
    return (matches / max(len1, len2)) * 100
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
|
|
33
95
|
def is_tool_installed(tool_name):
|
|
34
96
|
"""Check if a tool is installed and available in PATH."""
|
|
35
97
|
# Check if the tool is in the system PATH
|
|
@@ -49,12 +111,23 @@ def reverse_complement(seq):
|
|
|
49
111
|
complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
|
|
50
112
|
return ''.join(complement[base] for base in reversed(seq))
|
|
51
113
|
|
|
114
|
+
|
|
52
115
|
def fix_path(path):
|
|
53
116
|
fixed_path = os.path.normpath(path)
|
|
54
117
|
fixed_path = os.path.realpath(fixed_path)
|
|
55
118
|
return fixed_path
|
|
56
119
|
|
|
57
120
|
|
|
121
|
+
def extract_identity(clustered_info):
    """
    Extract the percent identity from a CD-HIT cluster member line.

    Parameters:
    - clustered_info: Text containing 'at <value>%'. The value may be
      preceded by a sign, or by '/'-terminated fields such as the strand
      marker in 'at +/99.66%'.

    Returns the identity as a float.
    Raises ValueError when no identity value is present.
    """
    # Optional '/'-terminated prefix (strand and/or alignment coordinates),
    # optional sign, then the numeric value followed by '%'.
    match = re.search(r'at\s+(?:\S*/)?[-+]*(\d+(?:\.\d+)?)%', clustered_info)

    if match is None:
        raise ValueError("Percent identity not found in the string.")
    return float(match.group(1))
|
|
130
|
+
|
|
58
131
|
def wrap_sequence(sequence, width=60):
|
|
59
132
|
wrapped_sequence = []
|
|
60
133
|
for i in range(0, len(sequence), width):
|
|
@@ -111,14 +184,15 @@ def run_mafft_on_sequences(options, sequences, output_file):
|
|
|
111
184
|
with open(output_file, 'w') as output_f:
|
|
112
185
|
if options.verbose == True:
|
|
113
186
|
subprocess.run(
|
|
114
|
-
['mafft', '--auto', temp_input_file_path],
|
|
187
|
+
['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
|
|
115
188
|
stdout=output_f,
|
|
116
189
|
stderr=sys.stderr,
|
|
117
190
|
check=True
|
|
118
191
|
)
|
|
192
|
+
|
|
119
193
|
else:
|
|
120
194
|
subprocess.run(
|
|
121
|
-
['mafft', '--auto', temp_input_file_path],
|
|
195
|
+
['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
|
|
122
196
|
stdout=output_f,
|
|
123
197
|
stderr=subprocess.DEVNULL, # Suppress stderr
|
|
124
198
|
check=True
|
|
@@ -265,30 +339,57 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
|
|
|
265
339
|
combined_out_file.write(f">{genome_name}|{id}\n{wrapped_sequence}\n")
|
|
266
340
|
|
|
267
341
|
|
|
268
|
-
def write_groups(options,output_dir, key_order, cores, sequences,
|
|
342
|
+
def write_groups(options, output_dir, key_order, cores, sequences,
                 pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
    """
    Write one FASTA file per selected gene group plus a combined FASTA file.

    Parameters:
    - options: Command-line options; write_groups (comma-separated selection)
      and verbose are read.
    - output_dir: Directory that receives the FASTA files; created if missing.
    - key_order: Key prefixes, processed in the given order.
    - cores: Mapping of group-category key to the group ids in that category.
    - sequences: Mapping of sequence header to sequence.
    - pangenome_clusters_First_sequences_sorted: Group id -> headers, used
      when the key prefix contains 'First'.
    - combined_pangenome_clusters_Second_sequences: Group id -> headers, used
      for every other key prefix.

    Headers missing from sequences are skipped (and reported when verbose).
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences.fasta")

    with open(combined_fasta_filename, 'w') as combined_fasta:
        for key_prefix in key_order:
            for key, values in cores.items():
                # A category is selected when any '_'-separated part of its key
                # was requested via options.write_groups.
                requested = options.write_groups.split(',')
                if not any(part in requested for part in key.split('_')):
                    continue
                if not key.startswith(key_prefix):
                    continue

                for value in values:
                    if 'First' in key_prefix:
                        sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
                    else:
                        sequences_to_write = combined_pangenome_clusters_Second_sequences[value]

                    group_path = os.path.join(output_dir, f"{key}_{value}.fasta")
                    with open(group_path, 'w') as outfile:
                        for header in sequences_to_write:
                            if header not in sequences:
                                if options.verbose:
                                    print(f"Sequence {header} not found in original_fasta file.")
                                continue

                            # Individual group file first ...
                            outfile.write(f">{header}\n")
                            wrapped_sequence = wrap_sequence(sequences[header])
                            outfile.write(f"{wrapped_sequence}\n")

                            # ... then the same record, tagged with its group, in the combined file.
                            combined_fasta.write(f">Group_{value}|{header}\n")
                            combined_fasta.write(f"{wrapped_sequence}\n")

    print(f"Combined FASTA file saved to: {combined_fasta_filename}")
|
|
392
|
+
|
|
292
393
|
|
|
293
394
|
def process_gene_families(options, directory, output_file):
|
|
294
395
|
"""Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
|
|
@@ -297,7 +398,7 @@ def process_gene_families(options, directory, output_file):
|
|
|
297
398
|
|
|
298
399
|
# Iterate over each gene family file
|
|
299
400
|
for gene_file in os.listdir(directory):
|
|
300
|
-
if gene_file.endswith('.fasta'):
|
|
401
|
+
if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta'):
|
|
301
402
|
gene_path = os.path.join(directory, gene_file)
|
|
302
403
|
|
|
303
404
|
# Read sequences from the gene family file
|
|
@@ -307,13 +408,15 @@ def process_gene_families(options, directory, output_file):
|
|
|
307
408
|
longest_sequences = select_longest_gene(sequences)
|
|
308
409
|
|
|
309
410
|
# Run mafft on the longest sequences
|
|
310
|
-
aligned_file = f"{gene_file}_aligned.fasta"
|
|
411
|
+
aligned_file = f"{directory}/{gene_file}_aligned.fasta.tmp"
|
|
311
412
|
run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
|
|
312
413
|
|
|
313
414
|
# Read aligned sequences and concatenate them
|
|
314
415
|
aligned_sequences = read_fasta(aligned_file)
|
|
315
416
|
for genome, aligned_seq in aligned_sequences.items():
|
|
316
417
|
genome_name = genome.split('|')[0]
|
|
418
|
+
if 'Group' in genome_name:
|
|
419
|
+
print(2)
|
|
317
420
|
if genome_name not in concatenated_sequences:
|
|
318
421
|
concatenated_sequences[genome_name] = ""
|
|
319
422
|
concatenated_sequences[genome_name] += aligned_seq
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyamilySeq
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: PyamilySeq - A tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
|
|
5
5
|
Home-page: https://github.com/NickJD/PyamilySeq
|
|
6
6
|
Author: Nicholas Dimonaco
|
|
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
|
|
|
58
58
|
```
|
|
59
59
|
### Example output:
|
|
60
60
|
```
|
|
61
|
-
Running PyamilySeq v0.
|
|
61
|
+
Running PyamilySeq v0.8.1
|
|
62
62
|
Calculating Groups
|
|
63
63
|
Gene Groups:
|
|
64
64
|
First_core_99: 2682
|
|
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
|
|
|
80
80
|
-cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
|
|
81
81
|
```
|
|
82
82
|
```commandline
|
|
83
|
-
Running PyamilySeq v0.
|
|
83
|
+
Running PyamilySeq v0.8.1
|
|
84
84
|
Calculating Groups
|
|
85
85
|
Genus Groups:
|
|
86
86
|
First_genera_1: 28549
|
|
@@ -137,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
|
|
|
137
137
|
## PyamilySeq - Menu:
|
|
138
138
|
### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
|
|
139
139
|
```
|
|
140
|
-
Running PyamilySeq v0.
|
|
140
|
+
Running PyamilySeq v0.8.1
|
|
141
141
|
usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
|
|
142
142
|
[-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
|
|
143
143
|
[-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
|
|
144
144
|
[-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
|
|
145
145
|
[-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
|
|
146
146
|
|
|
147
|
-
PyamilySeq v0.
|
|
147
|
+
PyamilySeq v0.8.1: A tool that groups genes into unique clusters.
|
|
148
148
|
|
|
149
149
|
options:
|
|
150
150
|
-h, --help show this help message and exit
|
|
@@ -176,8 +176,9 @@ Full-Mode Arguments - Required when "-run_mode Full" is used:
|
|
|
176
176
|
Clustering Runtime Arguments - Optional when "-run_mode Full" is used:
|
|
177
177
|
-mem CLUSTERING_MEMORY
|
|
178
178
|
Default 4000: Memory to be allocated for clustering (in MBs).
|
|
179
|
-
-t
|
|
180
|
-
|
|
179
|
+
-t THREADS Default 8: Threads to be allocated for clustering
|
|
180
|
+
and/or alignment.
|
|
181
|
+
|
|
181
182
|
|
|
182
183
|
Partial-Mode Arguments - Required when "-run_mode Partial" is used:
|
|
183
184
|
-cluster_file CLUSTER_FILE
|
|
@@ -221,7 +222,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
|
|
|
221
222
|
```
|
|
222
223
|
usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
|
|
223
224
|
|
|
224
|
-
Seq-Combiner v0.
|
|
225
|
+
Seq-Combiner v0.8.1: A tool to extract sequences from GFF/FASTA files.
|
|
225
226
|
|
|
226
227
|
options:
|
|
227
228
|
-h, --help show this help message and exit
|
|
@@ -247,4 +248,40 @@ Misc Arguments:
|
|
|
247
248
|
|
|
248
249
|
|
|
249
250
|
```
|
|
251
|
+
|
|
252
|
+
### Group-Splitter menu:
|
|
253
|
+
|
|
254
|
+
```
|
|
255
|
+
usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -output_dir OUTPUT_DIR [-pident PIDENT] [-len_diff LEN_DIFF] [-clustering_threads CLUSTERING_THREADS]
|
|
256
|
+
[-clustering_memory CLUSTERING_MEMORY] [-percent_threshold PERCENT_THRESHOLD] [-verbose] [-delete_temp_files] [-v]
|
|
257
|
+
|
|
258
|
+
Group-Splitter: v0.8.1: A tool to split "paralogous" groups identified by PyamilySeq.
|
|
259
|
+
|
|
260
|
+
options:
|
|
261
|
+
-h, --help show this help message and exit
|
|
262
|
+
|
|
263
|
+
Required Arguments:
|
|
264
|
+
-input_fasta INPUT_FASTA
|
|
265
|
+
Input FASTA file containing gene groups.
|
|
266
|
+
-sequence_type {AA,DNA}
|
|
267
|
+
Default - DNA: Are groups "DNA" or "AA" sequences?
|
|
268
|
+
-output_dir OUTPUT_DIR
|
|
269
|
+
Output directory.
|
|
270
|
+
|
|
271
|
+
Optional Arguments:
|
|
272
|
+
-pident PIDENT Sequence identity threshold (default: 0.9)
|
|
273
|
+
-len_diff LEN_DIFF Length difference threshold (default: 0.05)
|
|
274
|
+
-clustering_threads CLUSTERING_THREADS
|
|
275
|
+
Number of threads for clustering (default: 4)
|
|
276
|
+
-clustering_memory CLUSTERING_MEMORY
|
|
277
|
+
Memory limit in MB for clustering (default: 2000)
|
|
278
|
+
-percent_threshold PERCENT_THRESHOLD
|
|
279
|
+
Minimum percentage of genomes with paralogs (default: 80.0)
|
|
280
|
+
-verbose Print verbose output.
|
|
281
|
+
-delete_temp_files Delete all temporary files after processing.
|
|
282
|
+
|
|
283
|
+
Misc Arguments:
|
|
284
|
+
-v Print out version number and exit
|
|
285
|
+
```
|
|
286
|
+
|
|
250
287
|
### All example input and output data can be found in the 'test_data' directory.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
PyamilySeq/Constants.py,sha256=J_jZheqHCbmFVCLrY8nMe4T5VZQOQ7PbT_HmYSi58WM,31
|
|
2
|
+
PyamilySeq/Group_Splitter.py,sha256=wrz-vcQ2gJ40MLLczFY8te35_uYrOBuh2v-fJSIVsWo,15578
|
|
3
|
+
PyamilySeq/PyamilySeq.py,sha256=OAtz6b7dnvA-Qg0dnf2JXImiOtsDrDfVit7Q6DFbuPU,15265
|
|
4
|
+
PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
|
|
5
|
+
PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
|
|
6
|
+
PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
|
|
7
|
+
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
|
|
9
|
+
PyamilySeq/utils.py,sha256=vjPSIua4E72JTWlzH4CUaRcR-Z6Nr-RQ9N_92tfZI_w,19686
|
|
10
|
+
PyamilySeq-0.8.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
11
|
+
PyamilySeq-0.8.1.dist-info/METADATA,sha256=weIjFQkc7ggqkPlPkSA5an8eFiUzhDyxGl9t7-rJPsA,14555
|
|
12
|
+
PyamilySeq-0.8.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
13
|
+
PyamilySeq-0.8.1.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
|
|
14
|
+
PyamilySeq-0.8.1.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
15
|
+
PyamilySeq-0.8.1.dist-info/RECORD,,
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
PyamilySeq/Constants.py,sha256=4MNcQLwJguoC9fHBLbreAe-GNgNvtzYrF0MBM6BFY_s,31
|
|
2
|
-
PyamilySeq/PyamilySeq.py,sha256=RbM6G1yU64jlb9r7QRry1vw5mQsxndM6TrvMvq3BVik,15466
|
|
3
|
-
PyamilySeq/PyamilySeq_Genus.py,sha256=ZjD61mTW7NgmsfGfFVEXeIZoSCha9PaLtMPnqdTtacU,12413
|
|
4
|
-
PyamilySeq/PyamilySeq_Species.py,sha256=WL6pu8hlGpnemcpu1tLFmlUlPd4vJpQSW4Om5Hclu_k,14438
|
|
5
|
-
PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
|
|
6
|
-
PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
|
|
8
|
-
PyamilySeq/utils.py,sha256=-0OZxmX96kOTzms8gnbFBvc5DL6NsqNHNpLpQ4UjNk8,15726
|
|
9
|
-
PyamilySeq-0.7.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
10
|
-
PyamilySeq-0.7.1.dist-info/METADATA,sha256=IpbThlfEmO-S8Nl617eQB64Xzu9GJDz19L4Jhx7lwGY,13076
|
|
11
|
-
PyamilySeq-0.7.1.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
|
12
|
-
PyamilySeq-0.7.1.dist-info/entry_points.txt,sha256=QtXD1tmnLvRAkIpGWZgXm1lfLH8GGeCwxmgoHZaTp98,102
|
|
13
|
-
PyamilySeq-0.7.1.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
|
|
14
|
-
PyamilySeq-0.7.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|