PyamilySeq 0.9.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/utils.py CHANGED
@@ -5,8 +5,8 @@ import glob
5
5
  import collections
6
6
  from tempfile import NamedTemporaryFile
7
7
  import sys
8
- from line_profiler_pycharm import profile
9
8
  import re
9
+ import math
10
10
 
11
11
  ####
12
12
  # Placeholder for the distance function
@@ -44,7 +44,7 @@ except (ModuleNotFoundError, ImportError):
44
44
  #####
45
45
 
46
46
  ################### Translation is currently fixed to genetic code Table 11
47
- gencode = {
47
+ codon_table = {
48
48
  'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
49
49
  'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
50
50
  'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
@@ -63,10 +63,44 @@ gencode = {
63
63
  'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}
64
64
 
65
65
  def translate_frame(sequence):
66
- translate = ''.join([gencode.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
66
+ translate = ''.join([codon_table.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
67
67
  return translate
68
68
 
69
def translate_dna_to_aa(dna_fasta, aa_fasta):
    """Translate every record of a DNA FASTA file into an amino-acid FASTA file.

    Args:
        dna_fasta: Path of the nucleotide FASTA file to read.
        aa_fasta: Path of the amino-acid FASTA file to (over)write.

    Each record is translated in frame 1 with ``translate_frame`` (codons that
    are missing from ``codon_table`` become 'X', trailing 1-2 bases are
    ignored) and written wrapped at 60 characters per line with
    ``wrap_sequence``.  Records that have no sequence lines are not written.
    """

    def write_record(outfile, header, chunks):
        # Join once per record instead of growing a string line by line.
        dna_seq = "".join(chunks)
        if not dna_seq:
            return
        aa_seq = translate_frame(dna_seq)
        wrapped_aa_seq = wrap_sequence(aa_seq, 60)
        outfile.write(f"{header}\n{wrapped_aa_seq}\n")

    with open(dna_fasta, 'r') as infile, open(aa_fasta, 'w') as outfile:
        header = ""
        chunks = []
        for line in infile:
            if line.startswith('>'):
                # A new header closes the previous record.
                write_record(outfile, header, chunks)
                header = line.strip()
                chunks = []
            else:
                chunks.append(line.strip())
        # The last record has no following header to trigger the write.
        write_record(outfile, header, chunks)
94
+
69
95
 
96
def detect_sequence_type(fasta_file):
    """Guess whether a FASTA file holds DNA or amino-acid sequences.

    Header lines (starting with '>') are ignored.  The file is classified as
    amino acid as soon as a sequence line contains one of the letters
    E, F, I, L, P, Q or Z, in either upper or lower case.

    Args:
        fasta_file: Path of the FASTA file to inspect.

    Returns:
        False if any sequence line contains one of those letters (amino
        acids), True otherwise (treated as DNA).  A file without any sequence
        lines therefore also returns True.
    """
    protein_only_letters = set('EFILPQZ')
    with open(fasta_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                continue
            # Upper-case first so lower-case protein files are not
            # misclassified as DNA.
            if not protein_only_letters.isdisjoint(line.upper()):
                return False  # Contains amino acids
    return True  # Contains DNA
70
104
 
71
105
 
72
106
  def is_tool_installed(tool_name):
@@ -113,6 +147,16 @@ def wrap_sequence(sequence, width=60):
113
147
  return "\n".join(wrapped_sequence)
114
148
 
115
149
 
150
def read_genomes_from_fasta(fasta_file):
    """Collect the distinct genome names found in the headers of a FASTA file.

    The genome name is taken as the second '|'-separated field of every header
    line (lines that start with '>' after stripping whitespace).

    Args:
        fasta_file: Path of the FASTA file to scan.

    Returns:
        A sorted list of unique genome names.  Sorting makes the result
        independent of set iteration order, which varies between interpreter
        runs.

    Raises:
        IndexError: If a header line contains no '|' separator.
    """
    genomes = set()
    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                genomes.add(line.split('|')[1])
    return sorted(genomes)
159
+
116
160
  def read_fasta(fasta_file):
117
161
  sequences = {}
118
162
  current_sequence = None
@@ -138,11 +182,14 @@ def sort_keys_by_values(dict1, dict2):
138
182
  sorted_keys = sorted(dict1.keys(), key=lambda k: custom_sort_key(k, dict1, dict2), reverse=True)
139
183
  return sorted_keys
140
184
 
141
def select_longest_gene(sequences, subgrouped=False):
    """Select the longest sequence for each genome.

    Args:
        sequences: Mapping of sequence ID -> sequence string.
        subgrouped: When falsy the genome name is the first '|'-separated
            field of the sequence ID; when truthy it is the second field.
            Defaults to False so one-argument calls keep working.

    Returns:
        Dict mapping genome name -> (seq_id, sequence) of the longest sequence
        seen for that genome.  On a length tie the first sequence encountered
        is kept.

    Raises:
        IndexError: If ``subgrouped`` is truthy and an ID has no '|' separator.
    """
    # Pick the field once; a plain truthiness test cannot leave `genome`
    # unassigned the way separate `== False` / `== True` branches could.
    genome_field = 1 if subgrouped else 0
    longest_sequences = {}
    for seq_id, sequence in sequences.items():
        genome = seq_id.split('|')[genome_field]
        if genome not in longest_sequences or len(sequence) > len(longest_sequences[genome][1]):
            longest_sequences[genome] = (seq_id, sequence)
    return longest_sequences
@@ -182,7 +229,7 @@ def run_mafft_on_sequences(options, sequences, output_file):
182
229
 
183
230
 
184
231
  def read_separate_files(input_dir, name_split, gene_ident, combined_out, translate):
185
- with open(combined_out, 'w') as combined_out_file:
232
+ with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
186
233
  for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
187
234
  genome_name = os.path.basename(gff_file).split(name_split)[0]
188
235
  corresponding_fasta_file = os.path.splitext(gff_file)[0] + '.fa'
@@ -227,20 +274,27 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
227
274
  if contig in fasta_dict:
228
275
  if strand == '+':
229
276
  full_sequence = fasta_dict[contig][0]
230
- cds_sequence = full_sequence[start - 1:end]
277
+ seq = full_sequence[start - 1:end]
231
278
  elif strand == '-':
232
279
  corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
233
280
  corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
234
281
  full_sequence = fasta_dict[contig][1]
235
- cds_sequence = full_sequence[corrected_start:corrected_stop]
282
+ seq = full_sequence[corrected_start:corrected_stop]
283
+
236
284
  if translate == True:
237
- cds_sequence = translate_frame(cds_sequence)
238
- wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
285
+ seq_aa = translate_frame(seq)
286
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
287
+ combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
288
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
239
289
  combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
240
290
 
291
+ if translate == False:
292
+ #Clean up unused file
293
+ os.remove(combined_out_file_aa.name)
294
+
241
295
 
242
296
  def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
243
- with open(combined_out, 'w') as combined_out_file:
297
+ with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
244
298
  for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
245
299
  genome_name = os.path.basename(gff_file).split(name_split)[0]
246
300
  fasta_dict = collections.defaultdict(str)
@@ -284,21 +338,28 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
284
338
  if contig in fasta_dict:
285
339
  if strand == '+':
286
340
  full_sequence = fasta_dict[contig][0]
287
- cds_sequence = full_sequence[start - 1:end]
341
+ seq = full_sequence[start - 1:end]
288
342
  elif strand == '-':
289
343
  corrected_start = max(len(fasta_dict[contig][0]) - int(end), 1)
290
344
  corrected_stop = max(len(fasta_dict[contig][0]) - int(start - 1), 1)
291
345
  full_sequence = fasta_dict[contig][1]
292
- cds_sequence = full_sequence[corrected_start:corrected_stop]
346
+ seq = full_sequence[corrected_start:corrected_stop]
293
347
 
294
348
  if translate == True:
295
- cds_sequence = translate_frame(cds_sequence)
296
- wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
349
+ seq_aa = translate_frame(seq)
350
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
351
+ combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
352
+ wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
297
353
  combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
298
354
 
355
+ if translate == False:
356
+ #Clean up unused file
357
+ os.remove(combined_out_file_aa.name)
358
+
359
+
299
360
 
300
361
  def read_fasta_files(input_dir, name_split, combined_out, translate):
301
- with open(combined_out, 'w') as combined_out_file:
362
+ with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
302
363
  for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
303
364
  genome_name = os.path.basename(fasta_file).split(name_split)[0]
304
365
  fasta_dict = collections.defaultdict(str)
@@ -310,14 +371,19 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
310
371
  fasta_dict[current_seq] = ''
311
372
  else:
312
373
  fasta_dict[current_seq] +=line.strip()
313
- for id, seq in fasta_dict.items():
374
+ for seq_id, seq in fasta_dict.items():
314
375
  if translate == True:
315
- seq = translate_frame(seq)
376
+ seq_aa = translate_frame(seq)
377
+ wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
378
+ combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
316
379
  wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
317
- combined_out_file.write(f">{genome_name}|{id}\n{wrapped_sequence}\n")
380
+ combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
318
381
 
382
+ if translate == False:
383
+ #Clean up unused file
384
+ os.remove(combined_out_file_aa)
319
385
 
320
- def write_groups(options, output_dir, key_order, cores, sequences,
386
+ def write_groups_func(options, output_dir, key_order, cores, sequences,
321
387
  pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
322
388
  """
323
389
  Writes individual FASTA files and a combined FASTA file for all sequences.
@@ -335,74 +401,166 @@ def write_groups(options, output_dir, key_order, cores, sequences,
335
401
  if not os.path.exists(output_dir):
336
402
  os.makedirs(output_dir)
337
403
 
338
- combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences.fasta")
404
+ combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_dna.fasta")
339
405
 
340
406
  # Open combined FASTA file for writing all sequences
341
- with open(combined_fasta_filename, 'w') as combined_fasta:
407
+ with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
342
408
  for key_prefix in key_order:
343
409
  for key, values in cores.items():
344
410
  if any(part in options.write_groups.split(',') for part in key.split('_')):
345
411
  if key.startswith(key_prefix):
346
412
  for value in values:
347
- output_filename = f"{key}_{value}.fasta"
413
+ output_filename = f"{key}_{value}_dna.fasta"
348
414
  if 'First' in key_prefix:
349
415
  sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
350
416
  else:
351
417
  sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
352
418
 
353
419
  # Write individual FASTA file
354
- with open(os.path.join(output_dir, output_filename), 'w') as outfile:
420
+ with open(os.path.join(output_dir,output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
355
421
  for header in sequences_to_write:
356
422
  if header in sequences:
357
423
  sequence = sequences[header]
358
- outfile.write(f">{header}\n")
359
424
  wrapped_sequence = wrap_sequence(sequence)
360
- outfile.write(f"{wrapped_sequence}\n")
361
-
362
- # Also write to the combined FASTA file
425
+ # Handle Amino Acid Sequences (AA)
426
+ if options.sequence_type == 'AA':
427
+ seq_aa = translate_frame(sequence)
428
+ wrapped_sequence_aa = wrap_sequence(seq_aa)
429
+ # Write individual group file for AA, if option is enabled
430
+ if options.write_individual_groups:
431
+ outfile_aa.write(f">{header}\n")
432
+ outfile_aa.write(f"{wrapped_sequence_aa}\n")
433
+ else:
434
+ os.remove(outfile_aa.name) # Delete individual file if option is disabled
435
+ # Always write to the combined AA file
436
+ combined_fasta_aa.write(f">Group_{value}|{header}\n")
437
+ combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
438
+ # Handle Nucleotide Sequences
439
+ else:
440
+ # If the option is disabled, delete individual AA file (if created)
441
+ try:
442
+ os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
443
+ except FileNotFoundError:
444
+ pass
445
+ # Write individual group file for nucleotide sequence, if option is enabled
446
+ if options.write_individual_groups:
447
+ outfile.write(f">{header}\n")
448
+ outfile.write(f"{wrapped_sequence}\n")
449
+ else:
450
+ os.remove(outfile.name) # Delete individual file if option is disabled
451
+ # Always write to the combined nucleotide file
363
452
  combined_fasta.write(f">Group_{value}|{header}\n")
364
453
  combined_fasta.write(f"{wrapped_sequence}\n")
454
+
365
455
  else:
366
- if options.verbose:
456
+ if options.verbose == True:
367
457
  print(f"Sequence {header} not found in original_fasta file.")
368
-
458
+ if options.sequence_type != 'AA':
459
+ #Clean up unused file
460
+ os.remove(combined_fasta_aa.name)
369
461
  print(f"Combined FASTA file saved to: {combined_fasta_filename}")
370
462
 
371
463
 
372
- def process_gene_families(options, directory, output_file):
464
+ # def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, output_file):
465
+ # """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
466
+ # concatenated_sequences = {}
467
+ # output_file = group_directory.replace('Gene_Groups_Output',output_file)
468
+ #
469
+ # # Iterate over each gene family file
470
+ # for gene_file in os.listdir(group_directory):
471
+ # if gene_file.endswith('.fasta') and not gene_file.endswith('combined_group_sequences.fasta') :
472
+ # gene_path = os.path.join(group_directory, gene_file)
473
+ #
474
+ # # Read sequences from the gene family file
475
+ # sequences = read_fasta(gene_path)
476
+ #
477
+ # # Select the longest sequence for each genome
478
+ # longest_sequences = select_longest_gene(sequences)
479
+ #
480
+ # # Run mafft on the longest sequences
481
+ # aligned_file = f"{group_directory}/{gene_file}_aligned.fasta.tmp"
482
+ # run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
483
+ #
484
+ # # Read aligned sequences and concatenate them
485
+ # aligned_sequences = read_fasta(aligned_file)
486
+ # for genome, aligned_seq in aligned_sequences.items():
487
+ # genome_name = genome.split('|')[0]
488
+ # if genome_name not in concatenated_sequences:
489
+ # concatenated_sequences[genome_name] = ""
490
+ # concatenated_sequences[genome_name] += aligned_seq
491
+ #
492
+ # # Clean up aligned file
493
+ # os.remove(aligned_file)
494
+ #
495
+ # # Write the concatenated sequences to the output file
496
+ # with open(output_file, 'w') as out:
497
+ # for genome, sequence in concatenated_sequences.items():
498
+ # out.write(f">{genome}\n")
499
+ # wrapped_sequence = wrap_sequence(sequence, 60)
500
+ # out.write(f"{wrapped_sequence}\n")
501
+
502
def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
    """Align one gene group and append the result to the per-genome concatenation.

    Args:
        gene_path: FASTA file holding the sequences of the group to align.
        group_directory: Directory in which the temporary alignment is written.
        gene_file: File name used to build the temporary alignment file name.
        options: Passed through unchanged to ``run_mafft_on_sequences``.
        concatenated_sequences: Dict genome -> concatenated alignment so far;
            it is updated in place and also returned.
        subgrouped: Forwarded to ``select_longest_gene``; also selects which
            '|'-separated field of an aligned ID holds the genome name
            (second field when truthy, first otherwise).

    Returns:
        ``concatenated_sequences`` with, for every genome, either its aligned
        sequence for this group or a run of '-' of the alignment length when
        the genome has no sequence in the group.
    """
    # Read sequences from the gene family file
    sequences = read_fasta(gene_path)

    # Select the longest sequence for each genome
    longest_sequences = select_longest_gene(sequences, subgrouped)

    # Run mafft on the longest sequences
    aligned_file = f"{group_directory}/{gene_file}_aligned.fasta.tmp"
    run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)

    # Read the alignment, then drop the temporary file straight away
    aligned_sequences = read_fasta(aligned_file)
    os.remove(aligned_file)

    # Length used to pad genomes that are absent from this group; default=0
    # keeps an empty alignment from raising ValueError.
    max_length = max((len(seq) for seq in aligned_sequences.values()), default=0)

    # Index aligned sequences by their exact genome field.  Substring matching
    # would let genome "g1" pick up the sequence of "g10|...".
    # NOTE(review): strips a leading '>' in case read_fasta keeps it -- confirm.
    genome_field = 1 if subgrouped else 0
    aligned_by_genome = {}
    for seq_id, aligned_seq in aligned_sequences.items():
        fields = seq_id.lstrip('>').split('|')
        if len(fields) > genome_field:
            # setdefault keeps the first match, like the original next(...)
            aligned_by_genome.setdefault(fields[genome_field], aligned_seq)

    gap_block = "-" * max_length
    for genome in concatenated_sequences:
        concatenated_sequences[genome] += aligned_by_genome.get(genome, gap_block)

    return concatenated_sequences
529
+
530
def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, output_file):
    """Align every gene group file and write one concatenated alignment per genome.

    Args:
        options: Object providing ``align_core``, ``genome_num`` and
            ``align_aa``; also forwarded to ``perform_alignment``.
        group_directory: Directory containing the per-group FASTA files.
        sub_group_directory: Directory containing per-subgroup FASTA files
            (may be falsy when subgroups are not used).
        paralog_groups: Mapping '>Group_<n>' -> dict with a 'sizes' list, or
            None when no paralog information is available.
        genome_list: Genome names; each gets an entry in the output.
        output_file: Replaces the 'Gene_Groups_Output' part of
            ``group_directory`` to form the output path.

    Group files are visited in sorted name order so the column order of the
    concatenated alignment is reproducible (``os.listdir`` order is arbitrary).
    """
    concatenated_sequences = {genome: "" for genome in genome_list}
    output_file = group_directory.replace('Gene_Groups_Output', output_file)

    # Minimum subgroup size worth aligning; only meaningful with paralog data.
    threshold_size = None
    if paralog_groups is not None:
        threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)

    # The type of options.align_aa is not visible here, so the original
    # equality comparison is kept as is.
    if options.align_aa == True:
        affix = '_aa.fasta'
    else:
        affix = '_dna.fasta'

    # Iterate over each gene family file
    for gene_file in sorted(os.listdir(group_directory)):
        if not gene_file.endswith(affix) or gene_file.startswith('combined_group_sequences'):
            continue

        # The group number is taken from the fourth '_'-separated field of the file name
        current_group = int(gene_file.split('_')[3].split('.')[0])
        gene_path = os.path.join(group_directory, gene_file)
        group_key = '>Group_' + str(current_group)

        # Check for matching group in paralog_groups
        if sub_group_directory and paralog_groups and group_key in paralog_groups:
            for subgroup, size in enumerate(paralog_groups[group_key]['sizes']):
                if size >= threshold_size:
                    gene_path = os.path.join(sub_group_directory, f"Group_{current_group}_subgroup_{subgroup}{affix}")
                    concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
        else:
            concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)

    # Write the concatenated sequences to the output file
    with open(output_file, 'w') as out:
        for genome, sequence in concatenated_sequences.items():
            out.write(f">{genome}\n")
            wrapped_sequence = wrap_sequence(sequence, 60)
            out.write(f"{wrapped_sequence}\n")
566
+