PyamilySeq 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Constants.py CHANGED
@@ -1,2 +1,2 @@
1
- PyamilySeq_Version = 'v0.8.0'
1
+ PyamilySeq_Version = 'v0.8.1'
2
2
 
@@ -110,11 +110,13 @@ def read_cd_hit_output(clustering_output):
110
110
  clustered_header = clustered_info.split('>')[1].split('...')[0]
111
111
  clustered_header = '>' + clustered_header
112
112
 
113
- if 'at +' in clustered_info:
114
- percent_identity = float(clustered_info.split('at +/')[1].strip().replace('%', ''))
113
+ if 'at' in clustered_info:
114
+ percent_identity = extract_identity(line)
115
115
 
116
- if '*' in line:
116
+ elif '*' in line:
117
117
  percent_identity = 100.0
118
+ else:
119
+ raise ValueError("Percent identity not found in the string.")
118
120
 
119
121
  clusters[current_cluster_id].append({
120
122
  'header': clustered_header,
@@ -130,7 +132,6 @@ def separate_groups(input_fasta, options, clustering_mode):
130
132
 
131
133
  paralog_groups = defaultdict(int) # To track number of paralog groups
132
134
 
133
-
134
135
  for group_header, sequences in groups.items():
135
136
  group_name = group_header.split('|')[0] # Get the group part (e.g., '>Group_n')
136
137
 
@@ -149,11 +150,12 @@ def separate_groups(input_fasta, options, clustering_mode):
149
150
 
150
151
  group_file_name = group_name.replace('>','')
151
152
 
152
- temp_fasta = f"{options.output_dir}{group_file_name}.fasta"
153
+ temp_fasta = f"{options.output_dir}/{group_file_name}.fasta"
153
154
  write_fasta(sequences, temp_fasta)
154
155
 
155
156
  # Run cd-hit on the individual group
156
157
  clustering_output = f"{options.output_dir}/{group_file_name}_clustering"
158
+
157
159
  run_cd_hit(options, temp_fasta, clustering_output, clustering_mode)
158
160
 
159
161
  # Read the clustering results to find subgroups
@@ -255,7 +257,7 @@ def separate_groups(input_fasta, options, clustering_mode):
255
257
 
256
258
  # Determine the next subgroup for this genome
257
259
  subgroup_id = genome_count[genome] % num_subgroups
258
- new_header = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
260
+ new_header = f"{group_file_name}_subgroup_{subgroup_id}|{genome}|{header.split('|')[2]}"
259
261
  subgroup_sequences[subgroup_id].append((new_header, seq))
260
262
 
261
263
  # Increment the count for this genome
@@ -266,6 +268,12 @@ def separate_groups(input_fasta, options, clustering_mode):
266
268
  subgroup_file = f"{options.output_dir}/{group_file_name}_subgroup_{subgroup_id}.fasta"
267
269
  write_fasta(seqs, subgroup_file)
268
270
 
271
+ # Increment subgroup ID globally for the next subgroup
272
+ subgroup_id += 1
273
+ paralog_groups[group_name] += 1 # Count this group as a paralog group
274
+
275
+
276
+
269
277
  # Clean up temporary fasta file if the option is set
270
278
  if options.delete_temp_files:
271
279
  if temp_fasta and os.path.exists(temp_fasta):
@@ -288,6 +296,9 @@ def main():
288
296
  required.add_argument('-input_fasta', action='store', dest='input_fasta',
289
297
  help='Input FASTA file containing gene groups.',
290
298
  required=True)
299
+ required.add_argument('-sequence_type', action='store', dest='sequence_type', default='DNA',choices=['AA', 'DNA'],
300
+ help='Default - DNA: Are groups "DNA" or "AA" sequences?',
301
+ required=False)
291
302
  required.add_argument('-output_dir', action='store', dest='output_dir',
292
303
  help='Output directory.',
293
304
  required=True)
@@ -305,8 +316,8 @@ def main():
305
316
  optional.add_argument('-percent_threshold', action='store', dest='percent_threshold', type=float, default=80,
306
317
  help='Minimum percentage of genomes with paralogs (default: 80.0)')
307
318
  optional.add_argument('-verbose', action='store_true', dest='verbose', help='Print verbose output.')
308
- optional.add_argument('-delete_temp_files', action='store_true', dest='delete_temp_files',
309
- help='Delete all temporary files after processing.')
319
+ optional.add_argument('-no_delete_temp_files', action='store_false', dest='delete_temp_files',
320
+ help='Default: Delete all temporary files after processing.')
310
321
 
311
322
  misc = parser.add_argument_group('Misc Arguments')
312
323
  misc.add_argument('-v', action='store_true', dest='version',
@@ -325,7 +336,11 @@ def main():
325
336
  if not os.path.exists(options.output_dir):
326
337
  os.makedirs(options.output_dir)
327
338
 
328
- clustering_mode = 'cd-hit-est'
339
+ if options.sequence_type == 'DNA':
340
+ clustering_mode = 'cd-hit-est'
341
+ else:
342
+ clustering_mode = 'cd-hit'
343
+
329
344
  separate_groups(options.input_fasta, options, clustering_mode)
330
345
 
331
346
  print("Done")
PyamilySeq/PyamilySeq.py CHANGED
@@ -27,7 +27,7 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
27
27
  '-o', clustering_output,
28
28
  '-c', str(options.pident),
29
29
  '-s', str(options.len_diff),
30
- '-T', str(options.clustering_threads),
30
+ '-T', str(options.threads),
31
31
  '-M', str(options.clustering_memory),
32
32
  '-d', "0",
33
33
  '-sc', "1",
@@ -84,8 +84,8 @@ def main():
84
84
  clustering_args.add_argument("-mem", action="store", dest="clustering_memory", type=int, default=4000,
85
85
  help="Default 4000: Memory to be allocated for clustering (in MBs).",
86
86
  required=False)
87
- clustering_args.add_argument("-t", action="store", dest="clustering_threads", type=int, default=4,
88
- help="Default 4: Threads to be allocated for clustering.",
87
+ clustering_args.add_argument("-t", action="store", dest="threads", type=int, default=8,
88
+ help="Default 8: Threads to be allocated for clustering and/or alignment.",
89
89
  required=False)
90
90
 
91
91
  ###Partial-Mode Arguments
@@ -130,8 +130,9 @@ def main():
130
130
 
131
131
  ### Misc Arguments
132
132
  misc = parser.add_argument_group('Misc')
133
- misc.add_argument('-verbose', action='store_true', dest='verbose', default=None, help='Default - False: Print out runtime messages',
134
- required = False)
133
+ misc.add_argument('-verbose', action='store_true', dest='verbose', default=None,
134
+ help='Default - False: Print out runtime messages',
135
+ required = False)
135
136
  misc.add_argument('-v', action='store_true', dest='version',
136
137
  help='Default - False: Print out version number and exit',
137
138
  required=False)
@@ -254,6 +255,7 @@ def main():
254
255
  self.output_dir = options.output_dir
255
256
  self.gene_presence_absence_out = options.gene_presence_absence_out
256
257
  self.write_groups = options.write_groups
258
+ self.threads = options.threads
257
259
  self.align_core = options.align_core
258
260
  self.fasta = combined_out_file
259
261
  self.verbose = options.verbose
@@ -272,6 +274,7 @@ def main():
272
274
  self.output_dir = options.output_dir
273
275
  self.gene_presence_absence_out = options.gene_presence_absence_out
274
276
  self.write_groups = options.write_groups
277
+ self.threads = options.threads
275
278
  self.align_core = options.align_core
276
279
  self.fasta = options.original_fasta
277
280
  self.verbose = options.verbose
PyamilySeq/utils.py CHANGED
@@ -6,6 +6,7 @@ import collections
6
6
  from tempfile import NamedTemporaryFile
7
7
  import sys
8
8
  from line_profiler_pycharm import profile
9
+ import re
9
10
 
10
11
 
11
12
  ################### We are currently fixed using Table 11
@@ -110,12 +111,23 @@ def reverse_complement(seq):
110
111
  complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
111
112
  return ''.join(complement[base] for base in reversed(seq))
112
113
 
114
+
113
115
  def fix_path(path):
114
116
  fixed_path = os.path.normpath(path)
115
117
  fixed_path = os.path.realpath(fixed_path)
116
118
  return fixed_path
117
119
 
118
120
 
121
+ def extract_identity(clustered_info):
122
+ # Use regular expressions to capture the percentage value at the end of the line
123
+ match = re.search(r'at ([-+]*)(\d+\.\d+)%', clustered_info)
124
+
125
+ if match:
126
+ percent_identity = float(match.group(2)) # Extract the percentage value
127
+ return percent_identity
128
+ else:
129
+ raise ValueError("Percent identity not found in the string.")
130
+
119
131
  def wrap_sequence(sequence, width=60):
120
132
  wrapped_sequence = []
121
133
  for i in range(0, len(sequence), width):
@@ -172,14 +184,15 @@ def run_mafft_on_sequences(options, sequences, output_file):
172
184
  with open(output_file, 'w') as output_f:
173
185
  if options.verbose == True:
174
186
  subprocess.run(
175
- ['mafft', '--auto', temp_input_file_path],
187
+ ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
176
188
  stdout=output_f,
177
189
  stderr=sys.stderr,
178
190
  check=True
179
191
  )
192
+
180
193
  else:
181
194
  subprocess.run(
182
- ['mafft', '--auto', temp_input_file_path],
195
+ ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
183
196
  stdout=output_f,
184
197
  stderr=subprocess.DEVNULL, # Suppress stderr
185
198
  check=True
@@ -385,7 +398,7 @@ def process_gene_families(options, directory, output_file):
385
398
 
386
399
  # Iterate over each gene family file
387
400
  for gene_file in os.listdir(directory):
388
- if gene_file.endswith('.fasta'):
401
+ if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta'):
389
402
  gene_path = os.path.join(directory, gene_file)
390
403
 
391
404
  # Read sequences from the gene family file
@@ -395,13 +408,15 @@ def process_gene_families(options, directory, output_file):
395
408
  longest_sequences = select_longest_gene(sequences)
396
409
 
397
410
  # Run mafft on the longest sequences
398
- aligned_file = f"{gene_file}_aligned.fasta"
411
+ aligned_file = f"{directory}/{gene_file}_aligned.fasta.tmp"
399
412
  run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
400
413
 
401
414
  # Read aligned sequences and concatenate them
402
415
  aligned_sequences = read_fasta(aligned_file)
403
416
  for genome, aligned_seq in aligned_sequences.items():
404
417
  genome_name = genome.split('|')[0]
418
+ if 'Group' in genome_name:
419
+ print(2)
405
420
  if genome_name not in concatenated_sequences:
406
421
  concatenated_sequences[genome_name] = ""
407
422
  concatenated_sequences[genome_name] += aligned_seq
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: PyamilySeq
3
- Version: 0.8.0
3
+ Version: 0.8.1
4
4
  Summary: PyamilySeq - A a tool to look for sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
5
5
  Home-page: https://github.com/NickJD/PyamilySeq
6
6
  Author: Nicholas Dimonaco
@@ -58,7 +58,7 @@ Escherichia_coli_110957|ENSB:TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB:TIZS9k
58
58
  ```
59
59
  ### Example output:
60
60
  ```
61
- Running PyamilySeq v0.8.0
61
+ Running PyamilySeq v0.8.1
62
62
  Calculating Groups
63
63
  Gene Groups:
64
64
  First_core_99: 2682
@@ -80,7 +80,7 @@ PyamilySeq -run_mode Partial -group_mode Genus -clustering_format CD-HIT -output
80
80
  -cluster_file .../test_data/genus/CD-HIT/combined_cds_cd-hit_80_60.clstr -gpa
81
81
  ```
82
82
  ```commandline
83
- Running PyamilySeq v0.8.0
83
+ Running PyamilySeq v0.8.1
84
84
  Calculating Groups
85
85
  Genus Groups:
86
86
  First_genera_1: 28549
@@ -137,14 +137,14 @@ Please report any issues to: https://github.com/NickJD/PyamilySeq/issues
137
137
  ## PyamilySeq - Menu:
138
138
  ### PyamilySeq is separated into two main 'run modes', Full and Partial. They each have their own set of required and optional arguments.
139
139
  ```
140
- Running PyamilySeq v0.8.0
140
+ Running PyamilySeq v0.8.1
141
141
  usage: PyamilySeq.py [-h] -run_mode {Full,Partial} -group_mode {Species,Genus} -clustering_format {CD-HIT,TSV,CSV} -output_dir OUTPUT_DIR
142
142
  [-input_type {separate,combined}] [-input_dir INPUT_DIR] [-name_split NAME_SPLIT] [-sequence_type {AA,DNA}] [-gene_ident GENE_IDENT]
143
143
  [-pid PIDENT] [-len_diff LEN_DIFF] [-mem CLUSTERING_MEMORY] [-t CLUSTERING_THREADS] [-cluster_file CLUSTER_FILE]
144
144
  [-reclustered RECLUSTERED] [-seq_tag SEQUENCE_TAG] [-core_groups CORE_GROUPS] [-genus_groups GENUS_GROUPS] [-w WRITE_GROUPS] [-a]
145
145
  [-original_fasta ORIGINAL_FASTA] [-gpa] [-verbose] [-v]
146
146
 
147
- PyamilySeq v0.8.0: A tool that groups genes into unique clusters.
147
+ PyamilySeq v0.8.1: A tool that groups genes into unique clusters.
148
148
 
149
149
  options:
150
150
  -h, --help show this help message and exit
@@ -176,8 +176,9 @@ Full-Mode Arguments - Required when "-run_mode Full" is used:
176
176
  Clustering Runtime Arguments - Optional when "-run_mode Full" is used:
177
177
  -mem CLUSTERING_MEMORY
178
178
  Default 4000: Memory to be allocated for clustering (in MBs).
179
- -t CLUSTERING_THREADS
180
- Default 4: Threads to be allocated for clustering.
179
+ -t THREADS Default 8: Threads to be allocated for clustering
180
+ and/or alignment.
181
+
181
182
 
182
183
  Partial-Mode Arguments - Required when "-run_mode Partial" is used:
183
184
  -cluster_file CLUSTER_FILE
@@ -221,7 +222,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split _combined.gff3 -output
221
222
  ```
222
223
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} -name_split NAME_SPLIT -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]
223
224
 
224
- Seq-Combiner v0.8.0: A tool to extract sequences from GFF/FASTA files.
225
+ Seq-Combiner v0.8.1: A tool to extract sequences from GFF/FASTA files.
225
226
 
226
227
  options:
227
228
  -h, --help show this help message and exit
@@ -254,7 +255,7 @@ Misc Arguments:
254
255
  usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -output_dir OUTPUT_DIR [-pident PIDENT] [-len_diff LEN_DIFF] [-clustering_threads CLUSTERING_THREADS]
255
256
  [-clustering_memory CLUSTERING_MEMORY] [-percent_threshold PERCENT_THRESHOLD] [-verbose] [-delete_temp_files] [-v]
256
257
 
257
- Group-Splitter: v0.8.0: A tool to split "paralogous" groups identified by PyamilySeq.
258
+ Group-Splitter: v0.8.1: A tool to split "paralogous" groups identified by PyamilySeq.
258
259
 
259
260
  options:
260
261
  -h, --help show this help message and exit
@@ -262,6 +263,8 @@ options:
262
263
  Required Arguments:
263
264
  -input_fasta INPUT_FASTA
264
265
  Input FASTA file containing gene groups.
266
+ -sequence_type {AA,DNA}
267
+ Default - DNA: Are groups "DNA" or "AA" sequences?
265
268
  -output_dir OUTPUT_DIR
266
269
  Output directory.
267
270
 
@@ -0,0 +1,15 @@
1
+ PyamilySeq/Constants.py,sha256=J_jZheqHCbmFVCLrY8nMe4T5VZQOQ7PbT_HmYSi58WM,31
2
+ PyamilySeq/Group_Splitter.py,sha256=wrz-vcQ2gJ40MLLczFY8te35_uYrOBuh2v-fJSIVsWo,15578
3
+ PyamilySeq/PyamilySeq.py,sha256=OAtz6b7dnvA-Qg0dnf2JXImiOtsDrDfVit7Q6DFbuPU,15265
4
+ PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
5
+ PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
6
+ PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
7
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
9
+ PyamilySeq/utils.py,sha256=vjPSIua4E72JTWlzH4CUaRcR-Z6Nr-RQ9N_92tfZI_w,19686
10
+ PyamilySeq-0.8.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
11
+ PyamilySeq-0.8.1.dist-info/METADATA,sha256=weIjFQkc7ggqkPlPkSA5an8eFiUzhDyxGl9t7-rJPsA,14555
12
+ PyamilySeq-0.8.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
13
+ PyamilySeq-0.8.1.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
14
+ PyamilySeq-0.8.1.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
15
+ PyamilySeq-0.8.1.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- PyamilySeq/Constants.py,sha256=lbVZv4vDHroA83KCDTIGuVb6bubKYZbwLmhYHxedXQc,31
2
- PyamilySeq/Group_Splitter.py,sha256=raZMV9SN7Qqw5Hci5qpkaahR66JMQf6dX8TvThjh3kU,14986
3
- PyamilySeq/PyamilySeq.py,sha256=0607A9nqafoQ8IhBxGgGJ-v3DVV6C6-LgzdDIXb2C-c,15179
4
- PyamilySeq/PyamilySeq_Genus.py,sha256=hC34cHIFu8YaXXgcPyVwuWENlsxx-7mT-Qr6PAdio4U,12414
5
- PyamilySeq/PyamilySeq_Species.py,sha256=spgS-h-lrySZBiOiB6jX6pPRaL5j8f5V1Hq3XOjBOko,14404
6
- PyamilySeq/Seq_Combiner.py,sha256=dPDu6LlT3B-ZDn3wKZ3AeWraDgv2Tub_16l9CLc3tQ0,3353
7
- PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- PyamilySeq/clusterings.py,sha256=rcWFv0IiWoS4aUNRjDDwNEL86l1wIKa4vK4htAxy8Hg,18787
9
- PyamilySeq/utils.py,sha256=6UtYJW3_0rDhEhvrJi6R3smvKu2n_bjqUkuzr5DcJM4,19061
10
- PyamilySeq-0.8.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
11
- PyamilySeq-0.8.0.dist-info/METADATA,sha256=ZnpQvAQy5EXGrzS0G9y5qH2Rhmb0LW2HvOT-b5WJLoo,14436
12
- PyamilySeq-0.8.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
13
- PyamilySeq-0.8.0.dist-info/entry_points.txt,sha256=15BsozBN6vRWvZeQon05dY4YQT7DqP5i2TUqFWRGCvc,150
14
- PyamilySeq-0.8.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
15
- PyamilySeq-0.8.0.dist-info/RECORD,,