PyamilySeq 1.0.1-py3-none-any.whl → 1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Cluster_Compare.py +108 -0
- PyamilySeq/Cluster_Summary.py +59 -64
- PyamilySeq/Group_Extractor.py +83 -0
- PyamilySeq/Group_Sizes.py +87 -0
- PyamilySeq/PyamilySeq.py +26 -18
- PyamilySeq/PyamilySeq_Genus.py +3 -3
- PyamilySeq/PyamilySeq_Species.py +10 -8
- PyamilySeq/Seq_Combiner.py +25 -8
- PyamilySeq/clusterings.py +0 -2
- PyamilySeq/constants.py +1 -1
- PyamilySeq/utils.py +197 -114
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/METADATA +46 -85
- pyamilyseq-1.1.1.dist-info/RECORD +21 -0
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/WHEEL +1 -1
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/entry_points.txt +6 -0
- PyamilySeq-1.0.1.dist-info/RECORD +0 -18
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/LICENSE +0 -0
- {PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/top_level.txt +0 -0
PyamilySeq/Seq_Combiner.py
CHANGED

@@ -22,14 +22,17 @@ def main():
                               ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
                               'FASTA files together.',
                           required=True)
-    required.add_argument("-
-                          help="
-                          required=
+    required.add_argument("-name_split_gff", action="store", dest="name_split_gff",
+                          help="Substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff'). - Not needed with -input_type fasta",
+                          required=False)
+    required.add_argument("-name_split_fasta", action="store", dest="name_split_fasta",
+                          help="Substring used to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta').",
+                          required=False)
     required.add_argument("-output_dir", action="store", dest="output_dir",
                           help="Directory for all output files.",
                           required=True)
     required.add_argument("-output_name", action="store", dest="output_file",
-                          help="Output file name
+                          help="Output file name.",
                           required=True)

     optional = parser.add_argument_group('Optional Arguments')

@@ -48,19 +51,33 @@ def main():
     options = parser.parse_args()


+    if options.input_type == 'separate' and options.name_split_gff is None:
+        print("Please provide a substring to split the filename and extract the genome name.")
+        exit(1)
+    if options.input_type == 'combined' and options.name_split_gff is None:
+        print("Please provide a substring to split the filename and extract the genome name.")
+        exit(1)
+    if options.input_type == 'fasta' and options.name_split_fasta is None:
+        print("Please provide a substring to split the filename and extract the genome name.")
+        exit

     output_path = os.path.abspath(options.output_dir)
     if not os.path.exists(output_path):
         os.makedirs(output_path)

-
+    #output_file = options.output_file + '.fasta'
+    if os.path.exists(os.path.join(output_path, options.output_file)):
+        print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
+        exit(1)
+
+    combined_out_file = os.path.join(output_path, options.output_file )

     if options.input_type == 'separate':
-        read_separate_files(options.input_dir, options.
+        read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'combined':
-        read_combined_files(options.input_dir, options.
+        read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'fasta':
-        read_fasta_files(options.input_dir, options.
+        read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)

 if __name__ == "__main__":
     main()
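For orientation, the sketch below shows how the two new name-split options interact with the input-type check introduced in this release. It is an illustrative reconstruction, not the released code: the parser setup is simplified, and the choices list and error wording are assumptions.

```python
import argparse
import sys

# Illustrative sketch only: mirrors the -name_split_gff / -name_split_fasta validation
# added in Seq_Combiner.py above; the choices list and messages are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument("-input_type", dest="input_type", required=True,
                    choices=["separate", "combined", "fasta"])
parser.add_argument("-name_split_gff", dest="name_split_gff", required=False)
parser.add_argument("-name_split_fasta", dest="name_split_fasta", required=False)
options = parser.parse_args()

# GFF-based inputs need the GFF splitter; pure FASTA input needs the FASTA splitter.
if options.input_type in ("separate", "combined") and options.name_split_gff is None:
    sys.exit("Please provide -name_split_gff so genome names can be cut from GFF filenames.")
if options.input_type == "fasta" and options.name_split_fasta is None:
    sys.exit("Please provide -name_split_fasta so genome names can be cut from FASTA filenames.")
```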
PyamilySeq/clusterings.py
CHANGED

@@ -279,8 +279,6 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
     first = True
     for line in Second_in:
         if line.startswith('>'):
-            if '>Cluster 1997' in line:
-                print()
             if first == False:
                 cluster_size = len(Combined_clusters[cluster_id])
                 Combined_reps.update({rep: cluster_size})
PyamilySeq/constants.py
CHANGED

@@ -1,2 +1,2 @@
-PyamilySeq_Version = 'v1.0.1'
+PyamilySeq_Version = 'v1.1.1'
 
PyamilySeq/utils.py
CHANGED

@@ -228,15 +228,39 @@ def run_mafft_on_sequences(options, sequences, output_file):



-def read_separate_files(input_dir,
-
-
-
-
-
-
+def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
+    if run_as_combiner == True:
+        combined_out_file_aa = None
+    else:
+        combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+
+    with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+        paired_files_found = None
+        #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+        gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
+        if not gff_files:
+            sys.exit("Error: No GFF files found.")
+        for gff_file in gff_files:
+            genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
+            if name_split_fasta == None:
+                possible_extensions = ['.fa', '.fasta', '.fna']
+                corresponding_fasta_file = None
+                for ext in possible_extensions:
+                    temp_file = os.path.splitext(gff_file)[0] + ext
+                    if os.path.exists(temp_file):
+                        corresponding_fasta_file = temp_file
+                        break
+                if corresponding_fasta_file is None:
+                    print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
+                    continue
+            else:
+                corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
+                if not os.path.exists(corresponding_fasta_file):
+                    print("Corresponding FASTA file for GFF file '" + gff_file + "' not found. Skipping. - Try using the -name_split_fasta option.")
+                    continue

             gff_features = []
+            paired_files_found = True
             with open(gff_file, 'r') as file:
                 seen_seq_ids = collections.defaultdict(int)
                 lines = file.readlines()
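For readers skimming the hunk above: the reworked reader opens two output handles at once, and in combiner mode the amino-acid handle is pointed at os.devnull so later writes become no-ops. The snippet below isolates that pattern; the file names are hypothetical and not taken from the package.

```python
import os

# Minimal illustration of the dual-handle pattern above: the DNA output is always a real
# file, while the amino-acid output is either a real file or os.devnull, so downstream
# write() calls need no branching. Paths here are hypothetical.
def open_outputs(dna_path, aa_path=None):
    dna_out = open(dna_path, "w")
    aa_out = open(aa_path, "w") if aa_path else open(os.devnull, "w")
    return dna_out, aa_out

dna_out, aa_out = open_outputs("combined_dna.fasta")  # combiner mode: no AA file requested
aa_out.write(">seq1\nMKT\n")                          # silently discarded via os.devnull
dna_out.write(">seq1\nATGAAAACC\n")
dna_out.close()
aa_out.close()
```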
@@ -244,6 +268,7 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
                     line_data = line.split('\t')
                     if len(line_data) == 9:
                         if any(gene_type in line_data[2] for gene_type in gene_ident):
+                            seq_id = line_data[8].split('ID=')[1].split(';')[0]
                             contig = line_data[0]
                             feature = line_data[2]
                             strand = line_data[6]

@@ -253,7 +278,6 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
                                 seen_seq_ids[seq_id] + 1
                             else:
                                 seen_seq_ids[seq_id] = 1
-                            seq_id = line_data[8].split('ID=')[1].split(';')[0]
                             gff_features.append((contig, start, end, strand, feature, seq_id))
         fasta_dict = collections.defaultdict(str)
         with open(corresponding_fasta_file, 'r') as file:
@@ -281,21 +305,44 @@ def read_separate_files(input_dir, name_split, gene_ident, combined_out, transla
                 full_sequence = fasta_dict[contig][1]
                 seq = full_sequence[corrected_start:corrected_stop]

-                if
-
-
-
-
-
+                if run_as_combiner == True:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    else:
+                        wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                else:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                    combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

-        if
+        if not paired_files_found:
+            sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
+        if translate == False or translate == None:
             #Clean up unused file
-
+            try: # Catches is combined_out_file_aa is None
+                if combined_out_file.name != combined_out_file_aa.name:
+                    os.remove(combined_out_file_aa.name)
+            except AttributeError:
+                pass


-def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
-
-
+def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
+    if run_as_combiner == True:
+        combined_out_file_aa = None
+    else:
+        combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+    #with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
+    with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+        gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
+        if not gff_files:
+            sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
+        for gff_file in gff_files:
             genome_name = os.path.basename(gff_file).split(name_split)[0]
             fasta_dict = collections.defaultdict(str)
             gff_features = []
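The write branches in these hunks all lean on the same 60-column FASTA wrapping expression; the snippet below isolates it for clarity. The header and sequence are made-up example data.

```python
# Isolated form of the wrapping expression used in the hunks above: break a sequence into
# 60-character lines before writing it under a ">genome|seq_id" header. Example data only.
def wrap_60(seq):
    return '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])

record = ">Genome_A|gene_0001\n" + wrap_60("ATGAAACCC" * 20) + "\n"
print(record)  # 180 bases emitted as three 60-character lines
```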
@@ -331,7 +378,7 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla

         for contig, fasta in fasta_dict.items():
             reverse_sequence = reverse_complement(fasta[0])
-            fasta_dict[contig][1]=reverse_sequence
+            fasta_dict[contig][1] = reverse_sequence

         if fasta_dict and gff_features:
             for contig, start, end, strand, feature, seq_id in gff_features:
@@ -345,23 +392,43 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
                 full_sequence = fasta_dict[contig][1]
                 seq = full_sequence[corrected_start:corrected_stop]

-                if
-
-
-
-
-
-
-
+                if run_as_combiner == True:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    else:
+                        wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                else:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                    combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+        if translate == False or translate == None:
             #Clean up unused file
-
+            try: # Catches is combined_out_file_aa is None
+                if combined_out_file.name != combined_out_file_aa.name:
+                    os.remove(combined_out_file_aa.name)
+            except AttributeError:
+                pass



-def read_fasta_files(input_dir,
-
-
-
+def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
+    if run_as_combiner == True:
+        combined_out_file_aa = None
+    else:
+        combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+    with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+        fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
+        if not fasta_files:
+            sys.exit("Error: No GFF files found.")
+        for fasta_file in fasta_files:
+            genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
             fasta_dict = collections.defaultdict(str)
             with open(fasta_file, 'r') as file:
                 lines = file.readlines()
@@ -372,16 +439,30 @@ def read_fasta_files(input_dir, name_split, combined_out, translate):
                     else:
                         fasta_dict[current_seq] +=line.strip()
             for seq_id, seq in fasta_dict.items():
-                if
-
-
-
-
-
-
-
+                if run_as_combiner == True:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    else:
+                        wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                        combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                else:
+                    if translate == True:
+                        seq_aa = translate_frame(seq)
+                        wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                        combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                    wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                    combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+
+        if translate == False or translate == None:
             #Clean up unused file
-
+            try: # Catches is combined_out_file_aa is None
+                if combined_out_file.name != combined_out_file_aa.name:
+                    os.remove(combined_out_file_aa.name)
+            except AttributeError:
+                pass
+

 def write_groups_func(options, output_dir, key_order, cores, sequences,
                       pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
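Across all three readers above, file discovery and genome naming hinge on the name-split substrings. The short sketch below shows that mechanism in isolation; the directory, substring, and filenames are hypothetical.

```python
import glob
import os

# Isolated sketch of the discovery/naming mechanism used by the readers above:
# glob on '*' + name_split, then cut the genome name off the basename.
input_dir = "genomes"
name_split_fasta = "_dna.fasta"

for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split_fasta)):
    genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
    print(genome_name)  # e.g. "genomes/Strain_1_dna.fasta" -> "Strain_1"
```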
@@ -401,63 +482,65 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)

-
-
-
-
-    for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for group in options.write_groups.split(','):
+
+        combined_fasta_filename = os.path.join(output_dir, "combined_group_sequences_" + group + "_dna.fasta")
+
+        # Open combined FASTA file for writing all sequences
+        with open(combined_fasta_filename, 'w') as combined_fasta, open(combined_fasta_filename.replace('_dna.fasta','_aa.fasta'), 'w') as combined_fasta_aa:
+            for key_prefix in key_order:
+                for key, values in cores.items():
+                    if any(part in group for part in key.split('_')):
+                        if key.startswith(key_prefix):
+                            for value in values:
+                                output_filename = f"{key}_{value}_dna.fasta"
+                                if 'First' in key_prefix:
+                                    sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
+                                else:
+                                    sequences_to_write = combined_pangenome_clusters_Second_sequences[value]
+
+                                # Write individual FASTA file
+                                with open(os.path.join(output_dir,output_filename), 'w') as outfile, open(os.path.join(output_dir, output_filename.replace('_dna.fasta','_aa.fasta')), 'w') as outfile_aa:
+                                    for header in sequences_to_write:
+                                        if header in sequences:
+                                            sequence = sequences[header]
+                                            wrapped_sequence = wrap_sequence(sequence)
+                                            # Handle Amino Acid Sequences (AA)
+                                            if options.sequence_type == 'AA':
+                                                seq_aa = translate_frame(sequence)
+                                                wrapped_sequence_aa = wrap_sequence(seq_aa)
+                                                # Write individual group file for AA, if option is enabled
+                                                if options.write_individual_groups:
+                                                    outfile_aa.write(f">{header}\n")
+                                                    outfile_aa.write(f"{wrapped_sequence_aa}\n")
+                                                else:
+                                                    os.remove(outfile_aa.name) # Delete individual file if option is disabled
+                                                # Always write to the combined AA file
+                                                combined_fasta_aa.write(f">Group_{value}|{header}\n")
+                                                combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
+                                            # Handle Nucleotide Sequences
+                                            else:
+                                                # If the option is disabled, delete individual AA file (if created)
+                                                try:
+                                                    os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
+                                                except FileNotFoundError:
+                                                    pass
+                                                # Write individual group file for nucleotide sequence, if option is enabled
                                                 if options.write_individual_groups:
-
-
+                                                    outfile.write(f">{header}\n")
+                                                    outfile.write(f"{wrapped_sequence}\n")
                                                 else:
-                                                    os.remove(
-                                                    # Always write to the combined
-
-
-
-                                                else:
-                                                    # If the option is disabled, delete individual AA file (if created)
-                                                    try:
-                                                        os.remove(outfile_aa.name) # Ensure outfile_aa is removed when sequence_type isn't 'AA'
-                                                    except FileNotFoundError:
-                                                        pass
-                                                    # Write individual group file for nucleotide sequence, if option is enabled
-                                                    if options.write_individual_groups:
-                                                        outfile.write(f">{header}\n")
-                                                        outfile.write(f"{wrapped_sequence}\n")
+                                                    os.remove(outfile.name) # Delete individual file if option is disabled
+                                                # Always write to the combined nucleotide file
+                                                combined_fasta.write(f">Group_{value}|{header}\n")
+                                                combined_fasta.write(f"{wrapped_sequence}\n")
+
                                         else:
-
-
-
-
-
-                                            else:
-                                                if options.verbose == True:
-                                                    print(f"Sequence {header} not found in original_fasta file.")
-                                            if options.sequence_type != 'AA':
-                                                #Clean up unused file
-                                                os.remove(combined_fasta_aa.name)
+                                            if options.verbose == True:
+                                                print(f"Sequence {header} not found in original_fasta file.")
+            if options.sequence_type != 'AA':
+                #Clean up unused file
+                os.remove(combined_fasta_aa.name)
         print(f"Combined FASTA file saved to: {combined_fasta_filename}")

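Two small conventions carry a lot of weight in the rewritten write_groups_func above and in process_gene_groups below: a cluster key such as 'First_core_99' is selected for a requested group if any underscore-separated part of the key occurs in the group string, and each per-group file is named "{key}_{value}_dna.fasta", which is why the group number is later recovered from the fourth underscore field. The sketch below walks through both; the key, group, and filename values are assumptions drawn only from identifiers visible in this diff.

```python
# Sketch of the two naming conventions used in write_groups_func and process_gene_groups.
# All example values are assumptions based on identifiers visible in this diff.

def key_matches_group(key, group):
    # A key is written for a requested group if any of its '_'-separated parts
    # occurs in the group string (same test as in the hunk above).
    return any(part in group for part in key.split('_'))

print(key_matches_group("First_core_99", "core_99"))   # True  ("core" and "99" both occur)
print(key_matches_group("Second_soft_95", "core_99"))  # False (no part occurs)

# Per-group files are named f"{key}_{value}_dna.fasta", so for "First_core_99_142_dna.fasta"
# the group number sits in the fourth underscore-separated field:
gene_file = "First_core_99_142_dna.fasta"
current_group = int(gene_file.split('_')[3].split('.')[0])
print(current_group)  # 142
```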
@@ -502,7 +585,8 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
 def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
     # Read sequences from the gene family file
     sequences = read_fasta(gene_path)
-
+    if len(sequences) == 1: # We can't align a single sequence
+        return concatenated_sequences
     # Select the longest sequence for each genome
     longest_sequences = select_longest_gene(sequences, subgrouped)

@@ -539,23 +623,22 @@ def process_gene_groups(options, group_directory, sub_group_directory, paralog_g
     else:
         affix = '_dna.fasta'

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if options.align_core == True:
+        # Iterate over each gene family file
+        for gene_file in os.listdir(group_directory):
+            if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
+                current_group = int(gene_file.split('_')[3].split('.')[0])
+                gene_path = os.path.join(group_directory, gene_file)
+                # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
+                if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
+                    # Check for matching group in paralog_groups
+                    if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
+                        for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
+                            if size >= threshold_size:
+                                gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
+                                concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
+                    else:
+                        concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)

     # Write the concatenated sequences to the output file
     with open(output_file, 'w') as out: