PyamilySeq-0.6.0-py3-none-any.whl → PyamilySeq-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,6 @@
  #from line_profiler_pycharm import profile

- import copy
  import math
- import sys
-
-

  try:
      from .Constants import *
@@ -16,44 +12,6 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
  from utils import *


- def process_gene_families(options, directory, output_file):
-     """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
-     concatenated_sequences = {}
-     output_file = directory.replace('Gene_Families_Output',output_file)
-
-     # Iterate over each gene family file
-     for gene_file in os.listdir(directory):
-         if gene_file.endswith('.fasta'):
-             gene_path = os.path.join(directory, gene_file)
-
-             # Read sequences from the gene family file
-             sequences = read_fasta(gene_path)
-
-             # Select the longest sequence for each genome
-             longest_sequences = select_longest_gene(sequences)
-
-             # Run mafft on the longest sequences
-             aligned_file = f"{gene_file}_aligned.fasta"
-             run_mafft_on_sequences(options, {seq_id: seq for seq_id, seq in longest_sequences.values()}, aligned_file)
-
-             # Read aligned sequences and concatenate them
-             aligned_sequences = read_fasta(aligned_file)
-             for genome, aligned_seq in aligned_sequences.items():
-                 genome_name = genome.split('|')[0]
-                 if genome_name not in concatenated_sequences:
-                     concatenated_sequences[genome_name] = ""
-                 concatenated_sequences[genome_name] += aligned_seq
-
-             # Clean up aligned file
-             os.remove(aligned_file)
-
-     # Write the concatenated sequences to the output file
-     with open(output_file, 'w') as out:
-         for genome, sequence in concatenated_sequences.items():
-             out.write(f">{genome}\n")
-             wrapped_sequence = wrap_sequence(sequence, 60)
-             out.write(f"{wrapped_sequence}\n")
-
  def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
      print("Outputting gene_presence_absence file")
      output_dir = os.path.abspath(options.output_dir)
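The process_gene_families helper removed above emits the concatenated alignment as wrapped FASTA via a wrap_sequence utility. A minimal, hypothetical stand-in for that wrapping step (illustrative only, not the package's implementation):

    def wrap_sequence(sequence, width=60):
        # Break a sequence string into fixed-width lines for FASTA output.
        return "\n".join(sequence[i:i + width] for i in range(0, len(sequence), width))

    print(wrap_sequence("ACGT" * 40))   # 160 bases -> two 60-character lines plus one 40-character line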
@@ -128,7 +86,7 @@ def get_cores(options,genome_dict):
  #@profile
  def calc_First_only_core(cluster, First_num, groups, cores):
      groups_as_list = list(groups.values())
-     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num <= fir):
+     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
          res = idx
      family_group = list(groups)[res]
      cores['First_core_'+family_group].append(cluster)
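The added int() cast matters because, later in this diff, the value passed in as First_num is parsed from a "cluster:size" label with str.split, so it arrives as a string; in Python 3 an ordering comparison between str and int raises TypeError. A small illustration with made-up group boundaries:

    groups_as_list = [(1, 5), (6, 10)]   # hypothetical (low, high) genome-count ranges
    First_num = "7"                      # as produced by grouping[0].split(':')[1]
    # sec <= First_num <= fir would raise TypeError ('<=' not supported between int and str)
    matches = [idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir]
    print(matches)                       # [1]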
@@ -138,6 +96,7 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, core
      groups_as_list = list(groups.values())
      for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
          res = idx
+
      family_group = list(groups)[res]
      cores['extended_core_' + family_group].append(cluster)

@@ -145,8 +104,11 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, core
  #@profile
  def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF_Reporter but combined >1 PEP
      groups_as_list = list(groups.values())
-     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
-         res = idx
+     # Looping through the list to find the matching condition
+     for idx, (sec, fir) in enumerate(groups_as_list):
+         if sec <= First_num + Second_num <= fir:
+             res = idx
+             break
      family_group = list(groups)[res]
      cores['combined_core_' + family_group].append(cluster)

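Both the old generator form and the new explicit loop leave res unassigned when no (sec, fir) range contains the total, which would surface as an UnboundLocalError at list(groups)[res]. A self-contained sketch (hypothetical names and values, not the package's code) of an equivalent lookup that fails with a clear message instead:

    def find_group_index(ranges, total):
        # Return the index of the first (low, high) range containing total.
        for idx, (low, high) in enumerate(ranges):
            if low <= total <= high:
                return idx
        raise ValueError(f"no group range contains {total}")

    print(find_group_index([(1, 5), (6, 10)], 7))   # -> 1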
@@ -173,9 +135,9 @@ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count
  def cluster(options):

      if options.cluster_format == 'CD-HIT':
-         genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
-     elif options.cluster_format in ['TSV','CSV']:
-         genome_dict, pangenome_clusters_First, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')
+         genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_CDHIT(options, '|')
+     elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
+         genome_dict, pangenome_clusters_First, pangenome_clusters_First_genomes, pangenome_clusters_First_sequences, reps = cluster_EdgeList(options, '|')

      ###
      cores, groups = get_cores(options, genome_dict)
@@ -183,11 +145,11 @@ def cluster(options):

      if options.reclustered != None:
          if options.cluster_format == 'CD-HIT':
-             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_CDHIT(options, genome_dict, '|')
-         if options.cluster_format == ['TSV','CSV']:
-             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second = combined_clustering_Edge_List(options, '|')
-
-         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, '|')
+             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
+         elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
+             #Fix
+             combined_pangenome_clusters_First_Second_clustered,not_Second_only_cluster_ids,combined_pangenome_clusters_Second,combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')
+         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, '|')
      else:
          pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)

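The 0.6.0 line removed above compared a string against a list with ==, a branch that can never be taken (assuming cluster_format is a plain string from argparse); the 0.7.0 replacement switches to substring checks. A quick illustration of the three forms:

    cluster_format = 'TSV'
    print(cluster_format == ['TSV', 'CSV'])   # False - a str never equals a list
    print(cluster_format in ['TSV', 'CSV'])   # True  - membership test, as in 0.6.0's first branch
    print('TSV' in cluster_format)            # True  - substring test, the 0.7.0 form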
@@ -201,19 +163,30 @@ def cluster(options):
      pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_first_keys)

      print("Calculating Groups")
+     seen_groupings = []
      for cluster, numbers in pangenome_clusters_Type_sorted.items():
          ############################### Calculate First only
-         calc_First_only_core(cluster, numbers[1],groups,cores)
+         cluster = str(cluster)
+         if '78' in cluster:
+             pass
+         for grouping in numbers[2]: #!!# Could do with a more elegant solution
+             current_cluster = grouping[0].split(':')[0]
+             if current_cluster not in seen_groupings:
+                 seen_groupings.append(current_cluster)
+                 current_cluster_size = grouping[0].split(':')[1]
+                 calc_First_only_core(current_cluster, current_cluster_size,groups,cores)
+                 ############################# Calculate First and Reclustered-Second
+                 if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
+                     calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
+                 elif numbers[0] > 1 and numbers[3] >= 1: # If unique Seconds combined multiple Firsts
+                     calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
+                 elif numbers[4] >= 1:
+                     Number_Of_Second_Extending_But_Same_Genomes += 1
+             else:
+                 if options.verbose == True:
+                     print("First cluster " + current_cluster + " already processed - This is likely because it was clustered with another First representative.")

          if options.reclustered != None:
-             ############################# Calculate First and Reclustered-Second
-             if numbers[0] == 1 and numbers[3] >= 1: # If Seconds did not combine First reps
-                 calc_single_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
-             elif numbers[0] > 1 and numbers[3] >= 1: # If unique Secondss combined multiple Firsts
-                 calc_multi_First_extended_Second_only_core(cluster, numbers[1], groups, cores, numbers[3])
-             elif numbers[4] >= 1:
-                 Number_Of_Second_Extending_But_Same_Genomes += 1
-
              combined_pangenome_clusters_ONLY_Second_Type = defaultdict(list)
              combined_pangenome_clusters_Second_Type = defaultdict(list)
              for cluster, genomes in combined_pangenome_clusters_Second.items():
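For readers following the new grouping pass: each entry of numbers[2] appears to carry a "cluster:size" label whose two halves are recovered with split(':'), and seen_groupings prevents the same First cluster from being counted twice. A self-contained sketch with made-up labels (not the package's data):

    groupings = [("12:5", "geneA"), ("12:5", "geneB"), ("78:2", "geneC")]   # hypothetical numbers[2]
    seen_groupings = []
    for grouping in groupings:
        current_cluster, current_cluster_size = grouping[0].split(':')
        if current_cluster not in seen_groupings:
            seen_groupings.append(current_cluster)
            print(current_cluster, int(current_cluster_size))   # prints "12 5" then "78 2"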
@@ -230,6 +203,8 @@ def cluster(options):
      ###########################
      ### Output
      output_path = os.path.abspath(options.output_dir)
+     if not os.path.exists(output_path):
+         os.makedirs(output_path)
      stats_out = os.path.join(output_path,'summary_statistics.txt')
      key_order = ['First_core_', 'extended_core_', 'combined_core_', 'Second_core_','only_Second_core_']
      with open(stats_out, 'w') as outfile:
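The added directory guard works as written; for reference, os.makedirs also accepts exist_ok, which collapses the check-then-create into one call and avoids a race if the directory appears in between. A sketch with a hypothetical path:

    import os

    output_path = os.path.abspath("PyamilySeq_Output")   # hypothetical path
    os.makedirs(output_path, exist_ok=True)              # no prior os.path.exists() check needed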
@@ -240,40 +215,72 @@ def cluster(options):
                  if key.startswith(key_prefix):
                      print(f"{key}: {len(value)}")
                      outfile.write(f"{key}: {len(value)}\n")
-         print("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
-         outfile.write("Total Number of Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
-
+         print("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+         outfile.write("Total Number of First Gene Groups (Including Singletons): " + str(len(pangenome_clusters_First_sequences_sorted)))
+         if options.reclustered!= None:
+             print("Total Number of Second Gene Groups (Including Singletons): " + str(
+                 len(combined_pangenome_clusters_Second_sequences)))
+             print("Total Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
+                 Number_Of_Second_Extending_But_Same_Genomes))
+             outfile.write("\nTotal Number of Second Gene Groups (Including Singletons): " + str(
+                 len(combined_pangenome_clusters_Second_sequences)))
+             outfile.write("\nTotal Number of First Gene Groups That Had Additional Second Sequences But Not New Genomes: " + str(
+                 Number_Of_Second_Extending_But_Same_Genomes))
+     #Report number of first and second clusters and do the ame for genus
      if options.gene_presence_absence_out != None:
          gene_presence_absence_output(options,genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted)

-     if options.write_families != None and options.fasta != None:
-         sequences = read_fasta(options.fasta)
-         output_dir = os.path.dirname(os.path.abspath(options.output_dir))
-         output_dir = os.path.join(output_dir, 'Gene_Families_Output')
-
-         # Create output directory if it doesn't exist
-         if not os.path.exists(output_dir):
-             os.makedirs(output_dir)
-         for key_prefix in key_order:
-             for key, values in cores.items():
-                 if any(part in options.write_families.split(',') for part in key.split('_')):
-                     if key.startswith(key_prefix):
-                         for value in values:
-                             output_filename = f"{key}_{value}.fasta"
-                             sequences_to_write = pangenome_clusters_First_sequences_sorted[value]
-                             # Write sequences to output file that are in the sequences dictionary
-                             with open(os.path.join(output_dir, output_filename), 'w') as outfile:
-                                 for header in sequences_to_write:
-                                     if header in sequences:
-                                         outfile.write(f">{header}\n")
-                                         wrapped_sequence = wrap_sequence(sequences[header])
-                                         outfile.write(f"{wrapped_sequence}\n")
-
-     if options.con_core != None and options.fasta != None and options.write_families != None:
-         process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
-
-
-
-

+     ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -wruite-groups not presented then it needs
+     # to be done for alignment full anyway...
+
+     if options.run_mode == 'Full':
+         if options.reclustered == None:
+             combined_pangenome_clusters_Second_sequences = None
+         if options.write_groups != None:
+             print("Outputting gene group FASTA files")
+             sequences = read_fasta(options.fasta)
+             #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+             output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+             write_groups(options,output_dir, key_order, cores, sequences,
+                          pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+             if options.align_core != None:
+                 print("Processing gene group alignment")
+                 process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
+
+     elif options.run_mode == 'Partial':
+         if options.reclustered == None:
+             combined_pangenome_clusters_Second_sequences = None
+         if options.write_groups != None and options.fasta != None:
+             print("Outputting gene group FASTA files")
+             sequences = read_fasta(options.fasta)
+             #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+             output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+             write_groups(options,output_dir, key_order, cores, sequences,
+                          pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)
+
+             if options.align_core != None:
+                 print("Processing gene group alignment")
+                 process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
+
+
+
+     #
+     # if options.align_core != None:
+     #     #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
+     #     output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
+     #     if not os.path.exists(output_dir):
+     #         os.makedirs(output_dir)
+     #     process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')
+
+     #
+     # elif options.run_mode == 'Partial':
+     #     if options.align_core != None and options.fasta != None and options.write_groups != None:
+     #         process_gene_families(options, os.path.join(output_dir, 'Gene_Families_Output'), 'concatonated_genes_aligned.fasta')
+     #
+     #
+     #
+     #
+     #

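The 'Full' and 'Partial' branches added above run an identical write-then-align tail, which the in-code note flags as needing a rethink. One way the shared tail could be factored out, assuming the module's read_fasta, write_groups and process_gene_families helpers shown elsewhere in this diff (illustrative sketch, not the package's code):

    def write_and_align(options, key_order, cores, first_sequences_sorted, second_sequences):
        # Shared tail of the 'Full'/'Partial' run modes: optionally write per-group
        # FASTA files, then optionally build the concatenated core alignment.
        output_dir = os.path.join(options.output_dir, 'Gene_Families_Output')
        if options.write_groups is not None and options.fasta is not None:
            sequences = read_fasta(options.fasta)
            write_groups(options, output_dir, key_order, cores, sequences,
                         first_sequences_sorted, second_sequences)
        if options.align_core is not None:
            process_gene_families(options, output_dir, 'concatenated_genes_aligned.fasta')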
@@ -11,15 +11,16 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:


  def main():
-     parser = argparse.ArgumentParser(description='Seq-Combiner ' + PyamilySeq_Version + ': Seq-Combiner Run Parameters.')
+     parser = argparse.ArgumentParser(description='Seq-Combiner ' + PyamilySeq_Version + ': A tool to extract sequences from GFF/FASTA files.')
      ### Required Arguments
      required = parser.add_argument_group('Required Arguments')
      required.add_argument('-input_dir', action='store', dest='input_dir',
                            help='Directory location where the files are located.',
                            required=True)
-     required.add_argument("-input_type", action="store", dest="input_type", choices=['separate', 'combined'],
-                           help="Type of input files: 'separate' for separate FASTA and GFF files,"
-                                " 'combined' for GFF files with embedded FASTA sequences.",
+     required.add_argument('-input_type', action='store', dest='input_type', choices=['separate', 'combined', 'fasta'],
+                           help='Type of input files: "separate" for separate FASTA and GFF files,'
+                                ' "combined" for GFF files with embedded FASTA sequences and "fasta" for combining multiple '
+                                'FASTA files together.',
                            required=True)
      required.add_argument("-name_split", action="store", dest="name_split",
                            help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
@@ -30,15 +31,33 @@ def main():
      required.add_argument("-output_name", action="store", dest="output_file",
                            help="Output file name.",
                            required=True)
+     optional = parser.add_argument_group('Optional Arguments')
+     optional.add_argument('-gene_ident', action='store', dest='gene_ident', default='CDS',
+                           help='Default - "CDS": Identifier used for extraction of sequences such as "misc_RNA,gene,mRNA,CDS,rRNA,tRNA,tmRNA,CRISPR,ncRNA,regulatory_region,oriC,pseudo"'
+                                ' - Not compatible with "fasta" input mode.',
+                           required=False)
+     optional.add_argument('-translate', action='store_true', dest='translate', default=None,
+                           help='Default - False: Translate extracted sequences to their AA counterpart?',
+                           required=False)
+     misc = parser.add_argument_group('Misc Arguments')
+     misc.add_argument('-v', action='store_true', dest='version',
+                       help='Print out version number and exit',
+                       required=False)
+
      options = parser.parse_args()

+     if options.version:
+         sys.exit(PyamilySeq_Version)
+
      output_path = os.path.abspath(options.output_dir)
      combined_out_file = os.path.join(output_path, options.output_file)

      if options.input_type == 'separate':
-         read_separate_files(options.input_dir, options.name_split, )
-     else:
-         read_combined_files(options.input_dir, options.name_split, combined_out_file)
+         read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
+     elif options.input_type == 'combined':
+         read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, options.translate)
+     elif options.input_type == 'fasta':
+         read_fasta_files(options.input_dir, options.name_split, combined_out_file, options.translate)

  if __name__ == "__main__":
      main()
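One behavioural detail of the new -v flag worth noting: because it is a plain store_true argument checked only after parse_args(), argparse will still demand -input_dir and the other required arguments before the version check can run. A minimal, self-contained sketch (illustrative names, not the package's parser) of argparse's built-in 'version' action, which exits before required-argument validation:

    import argparse

    parser = argparse.ArgumentParser(description='Seq-Combiner (illustrative)')
    parser.add_argument('-v', action='version', version='PyamilySeq 0.7.0')
    parser.add_argument('-input_dir', required=True)
    parser.parse_args(['-v'])   # prints the version and exits, even though -input_dir is missing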