PyamilySeq 0.8.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
PyamilySeq-1.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,17 @@
+ Metadata-Version: 2.1
+ Name: PyamilySeq
+ Version: 1.0.0
+ Summary: rORForise - A tool to study read-level gene predictions.
+ Home-page: https://github.com/NickJD/rORForise
+ Author: Nicholas Dimonaco
+ Author-email: nicholas@dimonaco.co.uk
+ Project-URL: Bug Tracker, https://github.com/NickJD/rORForise/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+
+ # rORForise
+ Read-based gene coverage evaluation
PyamilySeq-1.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ PyamilySeq-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ PyamilySeq-1.0.0.dist-info/METADATA,sha256=AmvKK-9jDxFly93v2XT9WpmdU6n1jEPHCw7CgHr7ktM,608
+ PyamilySeq-1.0.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ PyamilySeq-1.0.0.dist-info/entry_points.txt,sha256=Ip84PS-IG05XWHiA98MiXE9AJVmqTa5O7BQ2cywrDoo,49
+ PyamilySeq-1.0.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+ PyamilySeq-1.0.0.dist-info/RECORD,,
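Each RECORD row is `path,hash,size`, where the hash is the URL-safe, unpadded base64 encoding of the file's SHA-256 digest (per the wheel spec). A minimal sketch for reproducing one of these values locally; the path in the usage comment is hypothetical:

```python
import base64
import hashlib

def record_hash(path):
    # Wheel RECORD entries use URL-safe base64 of the raw SHA-256 digest,
    # with the trailing '=' padding stripped.
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical usage against an unpacked wheel:
# print(record_hash("PyamilySeq-1.0.0.dist-info/METADATA"))
```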
PyamilySeq-1.0.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.1.0)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
PyamilySeq-1.0.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ eval = rORForise.evaluate:main
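This registers a console command named `eval` that imports `rORForise.evaluate` and calls its `main()`. The generated launcher behaves roughly like the sketch below (a simplification; the `rORForise.evaluate` module layout is taken on faith from the entry-point string):

```python
# Approximate behaviour of the generated 'eval' console script:
import sys
from importlib import import_module

def launch():
    module = import_module("rORForise.evaluate")  # module named in entry_points.txt
    sys.exit(module.main())                       # call the declared entry function

if __name__ == "__main__":
    launch()
```

Note that `eval` collides with the POSIX shell builtin of the same name, so the installed script may be awkward to invoke from an interactive shell.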
PyamilySeq-1.0.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+
PyamilySeq/Constants.py DELETED
@@ -1,2 +0,0 @@
- PyamilySeq_Version = 'v0.8.1'
-
@@ -1,350 +0,0 @@
- import subprocess
- import os
- import argparse
- from collections import defaultdict, OrderedDict
- from line_profiler_pycharm import profile
-
- try:
-     from .Constants import *
-     from .utils import *
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
-     from Constants import *
-     from utils import *
-
- def run_cd_hit(options, input_file, clustering_output, clustering_mode):
-     cdhit_command = [
-         clustering_mode,
-         '-i', input_file,
-         '-o', clustering_output,
-         '-c', str(options.pident),
-         '-s', str(options.len_diff),
-         '-T', str(options.clustering_threads),
-         '-M', str(options.clustering_memory),
-         '-d', "0",
-         '-sc', "1",
-         '-sf', "1"
-     ]
-     if options.verbose:
-         subprocess.run(cdhit_command)
-     else:
-         subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
-
- def calculate_new_rep_seq(cluster_data):
-     total_length = sum(entry['length'] for entry in cluster_data)
-     avg_length = total_length / len(cluster_data)
-
-     total_identity = sum(entry['percent_identity'] for entry in cluster_data)
-     avg_identity = total_identity / len(cluster_data)
-
-     # Calculate a score based on both length difference and percent identity
-     def score(entry):
-         length_diff = abs(entry['length'] - avg_length)
-         identity_diff = abs(entry['percent_identity'] - avg_identity)
-         return length_diff + (100 - identity_diff)  # You can weight these differently
-
-     rep_entry = min(cluster_data, key=score)
-     return rep_entry
-
-
- def length_within_threshold(rep_length, length, len_diff):
-     return abs(rep_length - length) / rep_length <= len_diff
-
-
- def check_if_all_identical(clustered_sequences):
-     lengths = {entry['length'] for cluster in clustered_sequences.values() for entry in cluster}
-     perc_idents = {entry['percent_identity'] for cluster in clustered_sequences.values() for entry in cluster}
-
-     return len(lengths) == 1 and len(perc_idents) == 1
-
-
- def read_fasta_groups(fasta_file):
-     groups = defaultdict(list)
-     genome_count = defaultdict(int)
-     current_group = None
-     current_sequence = []
-
-     with open(fasta_file, 'r') as f:
-         for line in f:
-             if line.startswith('>'):
-                 if current_group is not None:
-                     groups[current_group].append((current_group_header, ''.join(current_sequence)))
-
-                 current_group_header = line.strip()
-                 current_group = current_group_header.split('|')[0]
-                 genome = current_group_header.split('|')[1]
-                 current_sequence = []
-                 genome_count[genome] += 1
-             else:
-                 current_sequence.append(line.strip())
-
-     if current_group is not None:
-         groups[current_group].append((current_group_header, ''.join(current_sequence)))
-
-     return groups, genome_count
-
-
- def write_fasta(sequences, output_file):
-     with open(output_file, 'w') as f:
-         for header, seq in sequences:
-             f.write(f"{header}\n{seq}\n")
-
-
- def read_cd_hit_output(clustering_output):
-     clusters = OrderedDict()
-
-     with open(clustering_output, 'r') as f:
-         current_cluster_id = None
-
-         for line in f:
-             line = line.strip()
-             if line.startswith(">Cluster"):
-                 current_cluster_id = line.split(' ')[1]
-                 clusters[current_cluster_id] = []
-             elif line and current_cluster_id is not None:
-                 parts = line.split('\t')
-                 if len(parts) > 1:
-                     clustered_info = parts[1]
-                     length = clustered_info.split(',')[0]
-                     length = int(''.join(c for c in length if c.isdigit()))
-                     clustered_header = clustered_info.split('>')[1].split('...')[0]
-                     clustered_header = '>' + clustered_header
-
-                     if 'at' in clustered_info:
-                         percent_identity = extract_identity(line)
-
-                     elif '*' in line:
-                         percent_identity = 100.0
-                     else:
-                         raise ValueError("Percent identity not found in the string.")
-
-                     clusters[current_cluster_id].append({
-                         'header': clustered_header,
-                         'length': length,
-                         'percent_identity': percent_identity
-                     })
-
-     return clusters
-
-
- def separate_groups(input_fasta, options, clustering_mode):
-     groups, genome_count = read_fasta_groups(input_fasta)
-
-     paralog_groups = defaultdict(int)  # To track number of paralog groups
-
-     for group_header, sequences in groups.items():
-         group_name = group_header.split('|')[0]  # Get the group part (e.g., '>Group_n')
-
-         # Count genomes with more than one gene
-         genome_to_gene_count = defaultdict(int)
-         for header, _ in sequences:
-             genome = header.split('|')[1]
-             genome_to_gene_count[genome] += 1
-
-         num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
-         total_genomes = len(genome_to_gene_count)
-
-         # Check if the group meets the threshold for having paralogs
-         if total_genomes == 0 or (num_genomes_with_multiple_genes / total_genomes) * 100 < options.percent_threshold:
-             continue
-
-         group_file_name = group_name.replace('>','')
-
-         temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
-         write_fasta(sequences, temp_fasta)
-
-         # Run cd-hit on the individual group
-         clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
-
-         run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
-
-         # Read the clustering results to find subgroups
-         clustered_sequences = read_cd_hit_output(clustering_output + '.clstr')
-
-         # Detect if all sequences are identical in length and percentage identity
-         all_same = check_if_all_identical(clustered_sequences)
-
-         # **Global subgroup counter for the entire major group**
-         subgroup_id = 0
-         remaining_sequences = sequences.copy()  # Track unprocessed sequences
-         sequences_to_remove = []
-
-         if not all_same:
-             while remaining_sequences:
-                 # Track subgroups for this pass
-                 subgroup_sequences = []
-                 genome_seen = set()
-                 sequences_found = False  # Track if any sequence was added
-
-                 # Recalculate representative sequence dynamically based on remaining genes
-                 rep = calculate_new_rep_seq(
-                     [entry for cluster in clustered_sequences.values() for entry in cluster if
-                      entry['header'] in (h for h, _ in remaining_sequences)]
-                 )
-
-                 # Find the sequence corresponding to rep['header'] from the list of sequences
-                 rep_seq = next((seq for header, seq in sequences if header == rep['header']), None)
-
-                 # Process each genome to select the best matching sequence
-                 for genome in genome_to_gene_count:
-                     best_sequence = None
-                     best_score = -1  # Initialize with a very low similarity score
-
-                     # Iterate over each sequence in the remaining sequences for this genome
-                     for header, seq in remaining_sequences:
-                         genome_id = header.split('|')[1]
-
-                         if genome_id == genome:  # Ensure this sequence belongs to the current genome
-
-                             length = len(seq)
-                             if rep_seq == seq:
-                                 perc_ident = 100.0
-                             else:
-                                 perc_ident = calculate_similarity(rep_seq, seq)  # Define a function to calculate similarity
-
-                             # Calculate the length difference ratio (smaller ratio means closer length to the representative)
-                             length_diff_ratio = abs(rep['length'] - length) / rep['length']
-
-                             # Check if this sequence is more similar than the current best one
-                             if length_within_threshold(rep['length'], length,
-                                                        options.len_diff) and perc_ident >= options.pident:
-
-                                 # Combine percentage identity and length difference into a single score
-                                 # Here, you want a high identity and a small length difference
-                                 # Adjust the weight of length difference and similarity according to your requirements
-                                 score = perc_ident - (length_diff_ratio * 100)  # Weighting length diff (you can adjust the *100 factor)
-
-                                 # Check if this sequence has a higher score than the current best
-                                 if score > best_score:
-                                     best_score = score
-                                     best_sequence = (header, seq)  # Store the best matching sequence for this genome
-
-                     # Once the best sequence is identified, add it to the subgroup
-                     if best_sequence is not None:
-                         sequences_found = True  # At least one sequence was added
-                         new_header = f">{group_file_name}_subgroup_{subgroup_id}|{best_sequence[0].split('|')[1]}|{best_sequence[0].split('|')[2]}"
-                         subgroup_sequences.append((new_header, best_sequence[1]))
-                         sequences_to_remove.append(best_sequence)
-                         genome_seen.add(genome)
-
-                 # If no sequences were found for this pass, exit the loop
-                 # if not sequences_found:
-                 #     break
-
-                 # Write each subgroup into a separate FASTA file
-                 if subgroup_sequences:
-                     subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
-                     write_fasta(subgroup_sequences, subgroup_file)
-
-                 # Remove processed sequences from the remaining list
-                 remaining_sequences = [item for item in remaining_sequences if
-                                        item[0] not in {h for h, _ in sequences_to_remove}]
-
-                 # Increment subgroup ID globally for the next subgroup
-                 subgroup_id += 1
-                 paralog_groups[group_name] += 1  # Count this group as a paralog group
-
-
-         else:
-             # Condition 2: If sequences are identical, distribute genes evenly into subgroups
-             num_subgroups = 1000
-             subgroup_sequences = defaultdict(list)  # Store sequences for each subgroup
-             genome_count = defaultdict(int)  # Count how many genes have been assigned to each genome
-
-             # Iterate over all sequences regardless of whether the genome has been seen
-             for header, seq in sequences:
-                 genome = header.split('|')[1]
-
-                 # Determine the next subgroup for this genome
-                 subgroup_id = genome_count[genome] % num_subgroups
-                 new_header = f"{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
-                 subgroup_sequences[subgroup_id].append((new_header, seq))
-
-                 # Increment the count for this genome
-                 genome_count[genome] += 1
-
-             # Write out each subgroup to a separate FASTA file
-             for subgroup_id, seqs in subgroup_sequences.items():
-                 subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
-                 write_fasta(seqs, subgroup_file)
-
-             # Increment subgroup ID globally for the next subgroup
-             subgroup_id += 1
-             paralog_groups[group_name] += 1  # Count this group as a paralog group
-
-
-
-         # Clean up temporary fasta file if the option is set
-         if options.delete_temp_files:
-             if temp_fasta and os.path.exists(temp_fasta):
-                 os.remove(temp_fasta)
-             if os.path.exists(clustering_output + '.clstr'):
-                 os.remove(clustering_output + '.clstr')
-             if os.path.exists(clustering_output):
-                 os.remove(clustering_output)
-
-     # Print metrics about paralog groups
-     print(f"Identified {len(paralog_groups)} paralog groups:")
-     for group_id, count in paralog_groups.items():
-         print(f"Group ID: {group_id}, Number of new groups: {count}")
-
-
- def main():
-     parser = argparse.ArgumentParser(description='Group-Splitter: ' + PyamilySeq_Version + ': A tool to split "paralogous" groups identified by PyamilySeq.')
-     ### Required Arguments
-     required = parser.add_argument_group('Required Arguments')
-     required.add_argument('-input_fasta', action='store', dest='input_fasta',
-                           help='Input FASTA file containing gene groups.',
-                           required=True)
-     required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
-                           help='Default - DNA: Are groups "DNA" or "AA" sequences?',
-                           required=False)
-     required.add_argument('-output_dir', action='store', dest='output_dir',
-                           help='Output directory.',
-                           required=True)
-
-     optional = parser.add_argument_group('Optional Arguments')
-
-     optional.add_argument('-pident', action='store', dest='pident', type=float, default=0.9,
-                           help='Sequence identity threshold (default: 0.9)')
-     optional.add_argument('-len_diff', action='store', dest='len_diff', type=float, default=0.05,
-                           help='Length difference threshold (default: 0.05)')
-     optional.add_argument('-clustering_threads', action='store', dest='clustering_threads', type=int, default=4,
-                           help='Number of threads for clustering (default: 4)')
-     optional.add_argument('-clustering_memory', action='store', dest='clustering_memory', type=int, default=2000,
-                           help='Memory limit in MB for clustering (default: 2000)')
-     optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
-                           help='Minimum percentage of genomes with paralogs (default: 80.0)')
-     optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
-     optional.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
-                           help='Default: Delete all temporary files after processing.')
-
-     misc = parser.add_argument_group('Misc Arguments')
-     misc.add_argument('-v', action='store_true', dest='version',
-                       help='Print out version number and exit',
-                       required=False)
-
-     options = parser.parse_args()
-
-     # Check for version flag
-     if options.version:
-         print(f"Group-Splitter version {PyamilySeq_Version}")
-         exit(0)
-
-     options = parser.parse_args()
-
-     if not os.path.exists(options.output_dir):
-         os.makedirs(options.output_dir)
-
-     if options.sequence_type == 'DNA':
-         clustering_mode = 'cd-hit-est'
-     else:
-         clustering_mode = 'cd-hit'
-
-     separate_groups(options.input_fasta, options, clustering_mode)
-
-     print("Done")
-
-
- if __name__ == "__main__":
-     main()
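For context on the parsing logic in the deleted `read_cd_hit_output` above: CD-HIT's `.clstr` files pair a `>Cluster N` header with tab-separated member lines, where the representative line ends in `*` and other members report an alignment identity such as `at +/98.66%`. A self-contained sketch of that parse against a hypothetical two-member cluster; the simplified inline identity extraction stands in for the `extract_identity` helper imported from `utils`:

```python
from collections import OrderedDict

# Hypothetical fragment of CD-HIT '.clstr' output of the kind
# read_cd_hit_output() consumed: the representative ends in '*',
# members carry an 'at +/98.66%'-style identity.
CLSTR_SAMPLE = """\
>Cluster 0
0\t450nt, >Group_1|genome_A|gene_1... *
1\t447nt, >Group_1|genome_B|gene_7... at +/98.66%
"""

def parse_clstr(text):
    clusters = OrderedDict()
    current = None
    for line in text.splitlines():
        if line.startswith(">Cluster"):
            current = line.split(" ")[1]        # cluster id, e.g. '0'
            clusters[current] = []
        elif line and current is not None:
            info = line.split("\t")[1]
            length = int("".join(c for c in info.split(",")[0] if c.isdigit()))
            header = ">" + info.split(">")[1].split("...")[0]
            # Representative sequences are marked '*' and count as 100% identity.
            if line.endswith("*"):
                identity = 100.0
            else:
                identity = float(info.rsplit("/", 1)[1].rstrip("%"))
            clusters[current].append({"header": header, "length": length,
                                      "percent_identity": identity})
    return clusters

print(parse_clstr(CLSTR_SAMPLE))
```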