PyamilySeq 0.8.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq-1.0.0.dist-info/METADATA +17 -0
- PyamilySeq-1.0.0.dist-info/RECORD +6 -0
- {PyamilySeq-0.8.1.dist-info → PyamilySeq-1.0.0.dist-info}/WHEEL +1 -1
- PyamilySeq-1.0.0.dist-info/entry_points.txt +2 -0
- PyamilySeq-1.0.0.dist-info/top_level.txt +1 -0
- PyamilySeq/Constants.py +0 -2
- PyamilySeq/Group_Splitter.py +0 -350
- PyamilySeq/PyamilySeq.py +0 -295
- PyamilySeq/PyamilySeq_Genus.py +0 -242
- PyamilySeq/PyamilySeq_Species.py +0 -287
- PyamilySeq/Seq_Combiner.py +0 -63
- PyamilySeq/__init__.py +0 -0
- PyamilySeq/clusterings.py +0 -362
- PyamilySeq/utils.py +0 -432
- PyamilySeq-0.8.1.dist-info/METADATA +0 -287
- PyamilySeq-0.8.1.dist-info/RECORD +0 -15
- PyamilySeq-0.8.1.dist-info/entry_points.txt +0 -4
- PyamilySeq-0.8.1.dist-info/top_level.txt +0 -1
- {PyamilySeq-0.8.1.dist-info → PyamilySeq-1.0.0.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: PyamilySeq
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: rORForise - A tool to study read-level gene predictions.
|
|
5
|
+
Home-page: https://github.com/NickJD/rORForise
|
|
6
|
+
Author: Nicholas Dimonaco
|
|
7
|
+
Author-email: nicholas@dimonaco.co.uk
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/NickJD/rORForise/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.6
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
|
|
16
|
+
# rORForise
|
|
17
|
+
Read-based gene coverage evaluation
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
PyamilySeq-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
2
|
+
PyamilySeq-1.0.0.dist-info/METADATA,sha256=AmvKK-9jDxFly93v2XT9WpmdU6n1jEPHCw7CgHr7ktM,608
|
|
3
|
+
PyamilySeq-1.0.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
4
|
+
PyamilySeq-1.0.0.dist-info/entry_points.txt,sha256=Ip84PS-IG05XWHiA98MiXE9AJVmqTa5O7BQ2cywrDoo,49
|
|
5
|
+
PyamilySeq-1.0.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
6
|
+
PyamilySeq-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
PyamilySeq/Constants.py
DELETED
PyamilySeq/Group_Splitter.py
DELETED
|
@@ -1,350 +0,0 @@
|
|
|
1
|
-
import subprocess
|
|
2
|
-
import os
|
|
3
|
-
import argparse
|
|
4
|
-
from collections import defaultdict, OrderedDict
|
|
5
|
-
from line_profiler_pycharm import profile
|
|
6
|
-
|
|
7
|
-
try:
|
|
8
|
-
from .Constants import *
|
|
9
|
-
from .utils import *
|
|
10
|
-
except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
|
|
11
|
-
from Constants import *
|
|
12
|
-
from utils import *
|
|
13
|
-
|
|
14
|
-
def run_cd_hit(options, input_file, clustering_output, clustering_mode):
    """Run a CD-HIT executable on ``input_file``.

    Args:
        options: Parsed command-line options. ``pident``, ``len_diff``,
            ``clustering_threads``, ``clustering_memory`` and ``verbose``
            are read.
        input_file: Path handed to the tool's ``-i`` flag.
        clustering_output: Path handed to the tool's ``-o`` flag.
        clustering_mode: Name of the executable to launch; it is used as the
            first element of the command line.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status. Previously the exit status was ignored and the failure
            only showed up later as a missing output file.
    """
    cdhit_command = [
        clustering_mode,
        '-i', input_file,
        '-o', clustering_output,
        '-c', str(options.pident),
        '-s', str(options.len_diff),
        '-T', str(options.clustering_threads),
        '-M', str(options.clustering_memory),
        # Fixed flags; see the CD-HIT user guide for -d / -sc / -sf.
        '-d', "0",
        '-sc', "1",
        '-sf', "1",
    ]

    # The tool's own output is only shown when verbose output was requested.
    streams = {}
    if not options.verbose:
        streams = {'stdout': subprocess.DEVNULL, 'stderr': subprocess.DEVNULL}

    subprocess.run(cdhit_command, check=True, **streams)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def calculate_new_rep_seq(cluster_data):
    """Pick the entry closest to the average length and percent identity.

    Args:
        cluster_data: Iterable of dicts, each providing numeric ``'length'``
            and ``'percent_identity'`` values.

    Returns:
        The element of ``cluster_data`` with the smallest combined distance
        from the mean length and the mean percent identity. Ties go to the
        earliest entry.

    Raises:
        ValueError: If ``cluster_data`` is empty.
    """
    entries = list(cluster_data)
    if not entries:
        raise ValueError("Cannot choose a representative sequence from an empty cluster.")

    count = len(entries)
    avg_length = sum(entry['length'] for entry in entries) / count
    avg_identity = sum(entry['percent_identity'] for entry in entries) / count

    def score(entry):
        # Both terms are distances from the cluster average, so lower is
        # better for each. The earlier form added ``100 - identity_diff``,
        # which made min() favour the entry *furthest* from the average
        # identity. The two terms are weighted equally.
        length_diff = abs(entry['length'] - avg_length)
        identity_diff = abs(entry['percent_identity'] - avg_identity)
        return length_diff + identity_diff

    return min(entries, key=score)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def length_within_threshold(rep_length, length, len_diff):
    """Return whether ``length`` is within ``len_diff`` of ``rep_length``.

    The difference is measured as a fraction of ``rep_length``.

    Args:
        rep_length: Length of the representative sequence.
        length: Length of the sequence being compared.
        len_diff: Maximum allowed relative difference (inclusive).

    Returns:
        ``True`` when ``abs(rep_length - length) / rep_length <= len_diff``.
        A zero-length representative only matches another zero-length
        sequence, instead of raising ZeroDivisionError.
    """
    if rep_length == 0:
        return length == 0
    return abs(rep_length - length) / rep_length <= len_diff
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def check_if_all_identical(clustered_sequences):
    """Tell whether all clustered entries share one length and one identity.

    Args:
        clustered_sequences: Mapping whose values are lists of dicts with
            ``'length'`` and ``'percent_identity'`` keys.

    Returns:
        ``True`` only when exactly one distinct length and exactly one
        distinct percent identity occur across every entry of every cluster.
        A mapping without any entries therefore yields ``False``.
    """
    seen_lengths = set()
    seen_identities = set()

    for members in clustered_sequences.values():
        for member in members:
            seen_lengths.add(member['length'])
            seen_identities.add(member['percent_identity'])

    return (len(seen_lengths), len(seen_identities)) == (1, 1)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def read_fasta_groups(fasta_file):
    """Read a FASTA file and bucket its records by group.

    Header lines are expected to contain at least two ``|``-separated
    fields: the first is used as the group key (leading ``>`` included) and
    the second as the genome name.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A ``(groups, genome_count)`` tuple. ``groups`` maps each group key to
        a list of ``(full_header, sequence)`` tuples in file order;
        ``genome_count`` maps each genome name to the number of headers that
        carried it.
    """
    groups = defaultdict(list)
    genome_count = defaultdict(int)
    header = None
    group_key = None
    seq_parts = []

    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if not raw_line.startswith('>'):
                seq_parts.append(text)
                continue

            # A new header closes the record collected so far.
            if group_key is not None:
                groups[group_key].append((header, ''.join(seq_parts)))

            header = text
            fields = header.split('|')
            group_key = fields[0]
            genome_count[fields[1]] += 1
            seq_parts = []

    # Flush the final record, which has no following header to close it.
    if group_key is not None:
        groups[group_key].append((header, ''.join(seq_parts)))

    return groups, genome_count
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def write_fasta(sequences, output_file):
    """Write ``(header, sequence)`` pairs to ``output_file``.

    Each pair becomes two lines: the header exactly as given, then the
    sequence. An existing file is overwritten.

    Args:
        sequences: Iterable of ``(header, sequence)`` pairs.
        output_file: Destination path.
    """
    with open(output_file, 'w') as handle:
        handle.writelines(f"{header}\n{seq}\n" for header, seq in sequences)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def read_cd_hit_output(clustering_output):
    """Parse a CD-HIT ``.clstr`` file into per-cluster member records.

    Args:
        clustering_output: Path of the ``.clstr`` file.

    Returns:
        An ``OrderedDict`` mapping each cluster id (the token following
        ``>Cluster``) to a list of dicts with keys ``'header'`` (sequence
        name, prefixed with ``>``), ``'length'`` (int) and
        ``'percent_identity'`` (float).

    Raises:
        ValueError: If a member line is neither marked as the cluster
            representative (``*``) nor carries an ``at ...`` identity field.
    """
    clusters = OrderedDict()
    current_cluster_id = None

    with open(clustering_output, 'r') as f:
        for line in f:
            line = line.strip()

            if line.startswith(">Cluster"):
                current_cluster_id = line.split(' ')[1]
                clusters[current_cluster_id] = []
                continue

            if not line or current_cluster_id is None:
                continue

            parts = line.split('\t')
            if len(parts) < 2:
                continue

            clustered_info = parts[1]

            # The first comma-separated field holds the length plus a unit
            # suffix (e.g. "1500nt"); keep the digits only.
            length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))

            # Split "<name>... <status>" so that the status is inspected on
            # its own. Testing for "at" anywhere in the field misfires when
            # the sequence name itself contains "at" (e.g. "isolate").
            name, _, status = clustered_info.split('>', 1)[1].partition('...')
            clustered_header = '>' + name
            status = status.strip()

            if status.endswith('*'):
                # Representative of the cluster: identical to itself.
                percent_identity = 100.0
            elif status.startswith('at'):
                # extract_identity is defined outside this block.
                percent_identity = extract_identity(line)
            else:
                raise ValueError("Percent identity not found in the string.")

            clusters[current_cluster_id].append({
                'header': clustered_header,
                'length': length,
                'percent_identity': percent_identity,
            })

    return clusters
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def _split_divergent_group(sequences, clustered_sequences, genomes, group_file_name, options):
    """Split one group into subgroups holding at most one gene per genome.

    Each pass picks a representative from the still-unassigned sequences,
    then takes from every genome the unassigned sequence that best matches
    it, and writes those picks to ``<group>_subgroup_<n>.fasta``.

    Returns:
        The number of subgroup files written.
    """
    subgroup_id = 0
    remaining_sequences = list(sequences)
    removed_headers = set()

    while remaining_sequences:
        # Built once per pass so each membership test is O(1).
        remaining_headers = {header for header, _ in remaining_sequences}

        # Recalculate the representative from the entries still unassigned.
        rep = calculate_new_rep_seq(
            [entry for cluster in clustered_sequences.values() for entry in cluster
             if entry['header'] in remaining_headers]
        )
        # NOTE(review): rep_seq stays None if no FASTA header equals
        # rep['header']; calculate_similarity would then receive None.
        rep_seq = next((seq for header, seq in sequences if header == rep['header']), None)

        # Bucket the unassigned sequences by genome (second header field).
        by_genome = defaultdict(list)
        for header, seq in remaining_sequences:
            by_genome[header.split('|')[1]].append((header, seq))

        subgroup_sequences = []
        for genome in genomes:
            best_sequence = None
            best_score = None

            for header, seq in by_genome.get(genome, ()):
                length = len(seq)
                if rep_seq == seq:
                    perc_ident = 100.0
                else:
                    # calculate_similarity is defined outside this block.
                    perc_ident = calculate_similarity(rep_seq, seq)

                if not (length_within_threshold(rep['length'], length, options.len_diff)
                        and perc_ident >= options.pident):
                    continue

                # Higher identity and a smaller relative length difference
                # both raise the score.
                length_diff_ratio = abs(rep['length'] - length) / rep['length']
                score = perc_ident - (length_diff_ratio * 100)

                # The first qualifying candidate is always accepted; a fixed
                # starting score of -1 could reject candidates scoring lower.
                if best_sequence is None or score > best_score:
                    best_score = score
                    best_sequence = (header, seq)

            if best_sequence is not None:
                fields = best_sequence[0].split('|')
                new_header = f">{group_file_name}_subgroup_{subgroup_id}|{fields[1]}|{fields[2]}"
                subgroup_sequences.append((new_header, best_sequence[1]))
                removed_headers.add(best_sequence[0])

        if not subgroup_sequences:
            # Nothing qualified in this pass, so another pass would see the
            # same input; stop rather than loop forever.
            print(f"Warning: {len(remaining_sequences)} sequence(s) in {group_file_name} "
                  "could not be assigned to a subgroup.")
            break

        subgroup_file = os.path.join(options.output_dir, f"{group_file_name}_subgroup_{subgroup_id}.fasta")
        write_fasta(subgroup_sequences, subgroup_file)

        remaining_sequences = [item for item in remaining_sequences if item[0] not in removed_headers]
        subgroup_id += 1

    return subgroup_id


def _split_identical_group(sequences, group_file_name, options, num_subgroups=1000):
    """Spread the genes of each genome over numbered subgroups.

    The n-th gene seen for a genome goes to subgroup ``n % num_subgroups``,
    and one FASTA file is written per subgroup.

    Returns:
        The number of subgroup files written.
    """
    subgroup_sequences = defaultdict(list)
    genes_per_genome = defaultdict(int)

    for header, seq in sequences:
        fields = header.split('|')
        genome = fields[1]
        subgroup_id = genes_per_genome[genome] % num_subgroups
        # Leading '>' added so the written records are valid FASTA, matching
        # the headers produced for divergent groups.
        new_header = f">{group_file_name}_subgroup_{subgroup_id}|{genome}|{fields[2]}"
        subgroup_sequences[subgroup_id].append((new_header, seq))
        genes_per_genome[genome] += 1

    for subgroup_id, seqs in subgroup_sequences.items():
        subgroup_file = os.path.join(options.output_dir, f"{group_file_name}_subgroup_{subgroup_id}.fasta")
        write_fasta(seqs, subgroup_file)

    return len(subgroup_sequences)


def separate_groups(input_fasta, options, clustering_mode):
    """Split groups containing paralogs into per-subgroup FASTA files.

    For each group in ``input_fasta`` where the percentage of genomes that
    contribute more than one gene reaches ``options.percent_threshold``, the
    group is written to a temporary FASTA file, clustered with CD-HIT and
    split into subgroups. A summary is printed at the end.

    Args:
        input_fasta: FASTA file whose headers look like
            ``>group|genome|gene``.
        options: Parsed command-line options. ``percent_threshold``,
            ``output_dir``, ``len_diff``, ``pident`` and
            ``delete_temp_files`` are read here; ``run_cd_hit`` reads more.
        clustering_mode: Executable name forwarded to ``run_cd_hit``.
    """
    groups, _ = read_fasta_groups(input_fasta)

    paralog_groups = defaultdict(int)  # group name -> number of subgroups written

    for group_header, sequences in groups.items():
        group_name = group_header.split('|')[0]  # e.g. '>Group_n'

        # Number of genes each genome contributes to this group.
        genome_to_gene_count = defaultdict(int)
        for header, _ in sequences:
            genome_to_gene_count[header.split('|')[1]] += 1

        num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
        total_genomes = len(genome_to_gene_count)

        # Skip groups in which too few genomes carry more than one gene.
        if total_genomes == 0 or (num_genomes_with_multiple_genes / total_genomes) * 100 < options.percent_threshold:
            continue

        group_file_name = group_name.replace('>', '')

        temp_fasta = os.path.join(options.output_dir, f"{group_file_name}.fasta")
        write_fasta(sequences, temp_fasta)

        # Cluster this group on its own and read back the result.
        clustering_output = os.path.join(options.output_dir, f"{group_file_name}_clustering")
        run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
        clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')

        if check_if_all_identical(clustered_sequences):
            new_groups = _split_identical_group(sequences, group_file_name, options)
        else:
            new_groups = _split_divergent_group(
                sequences, clustered_sequences, genome_to_gene_count, group_file_name, options
            )

        if new_groups:
            paralog_groups[group_name] += new_groups

        # Remove the intermediate files unless the user asked to keep them.
        if options.delete_temp_files:
            for path in (temp_fasta, clustering_output + '.clstr', clustering_output):
                if path and os.path.exists(path):
                    os.remove(path)

    # Print metrics about paralog groups
    print(f"Identified {len(paralog_groups)} paralog groups:")
    for group_id, count in paralog_groups.items():
        print(f"Group ID: {group_id}, Number of new groups: {count}")
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
def main():
    """Command-line entry point for Group-Splitter.

    Parses the arguments, creates the output directory if needed, chooses
    the CD-HIT executable from the sequence type and runs
    ``separate_groups``.
    """
    parser = argparse.ArgumentParser(description='Group-Splitter: ' + PyamilySeq_Version + ': A tool to split "paralogous" groups identified by PyamilySeq.')
    ### Required Arguments
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('-input_fasta', action='store', dest='input_fasta',
                          help='Input FASTA file containing gene groups.',
                          required=True)
    required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA', choices=['AA', 'DNA'],
                          help='Default - DNA: Are groups "DNA" or "AA" sequences?',
                          required=False)
    required.add_argument('-output_dir', action='store', dest='output_dir',
                          help='Output directory.',
                          required=True)

    optional = parser.add_argument_group('Optional Arguments')

    optional.add_argument('-pident', action='store', dest='pident', type=float, default=0.9,
                          help='Sequence identity threshold (default: 0.9)')
    optional.add_argument('-len_diff', action='store', dest='len_diff', type=float, default=0.05,
                          help='Length difference threshold (default: 0.05)')
    optional.add_argument('-clustering_threads', action='store', dest='clustering_threads', type=int, default=4,
                          help='Number of threads for clustering (default: 4)')
    optional.add_argument('-clustering_memory', action='store', dest='clustering_memory', type=int, default=2000,
                          help='Memory limit in MB for clustering (default: 2000)')
    optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
                          help='Minimum percentage of genomes with paralogs (default: 80.0)')
    optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
    optional.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
                          help='Default: Delete all temporary files after processing.')

    misc = parser.add_argument_group('Misc Arguments')
    # argparse's built-in version action prints and exits while parsing, so
    # "-v" works on its own. A plain store_true flag was only inspected after
    # parse_args() had already rejected the missing required arguments.
    misc.add_argument('-v', action='version',
                      version=f"Group-Splitter version {PyamilySeq_Version}",
                      help='Print out version number and exit')

    options = parser.parse_args()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(options.output_dir, exist_ok=True)

    clustering_mode = 'cd-hit-est' if options.sequence_type == 'DNA' else 'cd-hit'

    separate_groups(options.input_fasta, options, clustering_mode)

    print("Done")


if __name__ == "__main__":
    main()
|