PyPI - PyamilySeq - Versions diffs - 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

PyamilySeq 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

PyamilySeq/Constants.py +1 -14
PyamilySeq/PyamilySeq.py +169 -118
PyamilySeq/PyamilySeq_Genus.py +659 -0
PyamilySeq/PyamilySeq_Species.py +33 -115
PyamilySeq/Seq_Combiner.py +44 -0
PyamilySeq/utils.py +136 -0
PyamilySeq-0.5.0.dist-info/METADATA +163 -0
PyamilySeq-0.5.0.dist-info/RECORD +14 -0
{PyamilySeq-0.4.0.dist-info → PyamilySeq-0.5.0.dist-info}/WHEEL +1 -1
{PyamilySeq-0.4.0.dist-info → PyamilySeq-0.5.0.dist-info}/entry_points.txt +1 -0
PyamilySeq/combine_FASTA_with_genome_IDs.py +0 -49
PyamilySeq-0.4.0.dist-info/METADATA +0 -92
PyamilySeq-0.4.0.dist-info/RECORD +0 -12
{PyamilySeq-0.4.0.dist-info → PyamilySeq-0.5.0.dist-info}/LICENSE +0 -0
{PyamilySeq-0.4.0.dist-info → PyamilySeq-0.5.0.dist-info}/top_level.txt +0 -0

PyamilySeq/Constants.py CHANGED Viewed

@@ -1,15 +1,2 @@
-import subprocess
+PyamilySeq_Version = 'v0.5.0'
-PyamilySeq_Version = 'v0.4.0'
-def is_tool_installed(tool_name):
-    """Check if a tool is installed and available in PATH."""
-    try:
-        subprocess.run([tool_name, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
-        return True
-    except subprocess.CalledProcessError:
-        return False
-    except FileNotFoundError:
-        return False

PyamilySeq/PyamilySeq.py CHANGED Viewed

@@ -9,64 +9,13 @@ from PyamilySeq_Species import *
 try:
     from .PyamilySeq_Species import cluster
     from .Constants import *
+    from .utils import *
 except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from PyamilySeq_Species import cluster
     from Constants import *
+    from utils import *
-def reverse_complement(seq):
-    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
-    return ''.join(complement[base] for base in reversed(seq))
-def read_separate_files(input_dir, name_split, combined_out):
-    with open(combined_out, 'w') as combined_out_file:
-        for fasta_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
-            genome_name = os.path.basename(fasta_file).split(name_split)[0]
-            corresponding_gff_file = fasta_file.replace('.fasta', '.gff')
-            if not os.path.exists(corresponding_gff_file):
-                continue
-            cds_sequences = extract_cds_from_gff(fasta_file, corresponding_gff_file)
-            for gene_name, seq in cds_sequences:
-                header = f">{genome_name}_{gene_name}\n"
-                combined_out_file.write(header)
-                combined_out_file.write(seq + '\n')
-def read_combined_files(input_dir, name_split, combined_out):
-    with open(combined_out, 'w') as combined_out_file:
-        for gff_file in glob.glob(os.path.join(input_dir, '*' + name_split)):
-            genome_name = os.path.basename(gff_file).split(name_split)[0]
-            fasta_dict = collections.defaultdict(str)
-            gff_features = []
-            with open(gff_file, 'r') as file:
-                lines = file.readlines()
-                fasta_section = False
-                for line in lines:
-                    if line.startswith('##FASTA'):
-                        fasta_section = True
-                        continue
-                    if fasta_section:
-                        if line.startswith('>'):
-                            current_contig = line[1:].split()[0]
-                            fasta_dict[current_contig] = []
-                        else:
-                            fasta_dict[current_contig].append(line.strip())
-                    else:
-                        line_data = line.split('\t')
-                        if len(line_data) == 9:
-                            if line_data[2] == 'CDS':
-                                contig = line_data[0]
-                                feature = line_data[2]
-                                start, end = int(line_data[3]), int(line_data[4])
-                                seq_id = line_data[8].split('ID=')[1].split(';')[0]
-                                gff_features.append((contig, start, end, seq_id))
-                if fasta_dict and gff_features:
-                    for contig, start, end, seq_id in gff_features:
-                        if contig in fasta_dict:
-                            full_sequence = ''.join(fasta_dict[contig])
-                            cds_sequence = full_sequence[start - 1:end]
-                            wrapped_sequence = '\n'.join([cds_sequence[i:i + 60] for i in range(0, len(cds_sequence), 60)])
-                            combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
 def run_cd_hit(input_file, clustering_output, options):
@@ -81,45 +30,66 @@ def run_cd_hit(input_file, clustering_output, options):
         '-sc', "1",
         '-sf', "1"
     ]
-    subprocess.run(cdhit_command)
+    if options.verbose == True:
+        subprocess.run(cdhit_command)
+    else:
+        subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 def main():
-    parser = argparse.ArgumentParser(
-        description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
+    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': PyamilySeq Run Parameters.')
+    ### Required Arguments
     required = parser.add_argument_group('Required Arguments')
-    required.add_argument("-id", action="store", dest="input_dir",
-                          help="Directory containing GFF/FASTA files.",
-                          required=True)
-    required.add_argument("-od", action="store", dest="output_dir",
-                          help="Directory for all output files.",
-                          required=True)
-    required.add_argument("-it", action="store", dest="input_type", choices=['separate', 'combined'],
-                        help="Type of input files: 'separate' for separate FASTA and GFF files,"
-                             " 'combined' for GFF files with embedded FASTA sequences.",
-                          required=True)
-    required.add_argument("-ns", action="store", dest="name_split",
-                          help="Character used to split the filename and extract the genome name.",
-                          required=True)
-    required.add_argument("-pid", action="store", dest="pident", type=float,
-                          help="Pident threshold for CD-HIT clustering.",
+    required.add_argument('-run_mode', action='store', dest='run_mode', choices=['Full','Partial'],
+                          help='Run Mode: Should PyamilySeq be run in "Full" or "Partial" mode?',
                           required=True)
-    required.add_argument("-ld", action="store", dest="len_diff", type=float,
-                          help="Length difference (-s) threshold for CD-HIT clustering.",
+    required.add_argument('-group_mode', action='store', dest='group_type', choices=['Species','Genus'],
+                          help='Group Mode: Should PyamilySeq be run in "Species" or "Genus" mode?',
                           required=True)
-    required.add_argument("-co", action="store", dest="clustering_out",
-                          help="Output file for initial clustering.",
+    required.add_argument("-clust_tool", action="store", dest="clust_tool", choices=['CD-HIT'],
+                          help="Clustering tool to use: CD-HIT, DIAMOND, BLAST or MMseqs2.",
                           required=True)
-    required.add_argument("-ct", action="store", dest="clustering_type", choices=['CD-HIT', 'BLAST', 'DIAMOND', "MMseqs2"],
-                        help="Clustering format for PyamilySeq.",
+    required.add_argument("-output_dir", action="store", dest="output_dir",
+                          help="Directory for all output files.",
                           required=True)
+    ### Full-Mode Arguments
+    full_mode_args = parser.add_argument_group('Full-Mode Arguments - Required when "-run_mode Full" is used')
+    full_mode_args.add_argument("-input_type", action="store", dest="input_type", choices=['separate', 'combined'],
+                          help="Type of input files: 'separate' for separate FASTA and GFF files,"
+                             " 'combined' for GFF files with embedded FASTA sequences.",
+                          required=False)
+    full_mode_args.add_argument("-input_dir", action="store", dest="input_dir",
+                          help="Directory containing GFF/FASTA files.",
+                          required=False)
+    full_mode_args.add_argument("-name_split", action="store", dest="name_split",
+                          help="substring used to split the filename and extract the genome name ('_combined.gff3' or '.gff').",
+                          required=False)
+    full_mode_args.add_argument("-pid", action="store", dest="pident", type=float, default=0.95,
+                          help="Default 0.95: Pident threshold for clustering.",
+                          required=False)
+    full_mode_args.add_argument("-len_diff", action="store", dest="len_diff", type=float, default=0.80,
+                          help="Default 0.80: Minimum length difference between clustered sequences - (-s) threshold for CD-HIT clustering.",
+                          required=False)
+    ###Partial-Mode Arguments
+    partial_mode_args = parser.add_argument_group('Partial-Mode Arguments - Required when "-run_mode Partial" is used')
+    partial_mode_args.add_argument('-cluster_file', action='store', dest='cluster_file',
+                        help='Clustering output file containing CD-HIT, TSV or CSV Edge List',
+                        required=False)
+    ###Grouping Arguments
+    grouping_args = parser.add_argument_group('Grouping Arguments - Use to fine-tune grouping of genes after clustering')
+    grouping_args.add_argument('-reclustered', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
+                        required=False)
+    grouping_args.add_argument('-seq_tag', action='store', dest='sequence_tag', default='StORF',
+                        help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
+                        required=False)
+    grouping_args.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
+                        help='Default - (\'99,95,15\'): Gene family groups to use',
+                        required=False)
+    ###Output Arguments
     output_args = parser.add_argument_group('Output Parameters')
     output_args.add_argument('-w', action="store", dest='write_families', default=None,
                           help='Default - No output: Output sequences of identified families (provide levels at which to output "-w 99,95"'
@@ -129,58 +99,139 @@ def main():
                           help='Default - No output: Output aligned and concatinated sequences of identified families - used for MSA (provide levels at which to output "-w 99,95"'
                                ' - Must provide FASTA file with -fasta',
                           required=False)
-    output_args.add_argument('-fasta', action='store', dest='fasta',
-                          help='FASTA file to use in conjunction with "-w" or "-con"',
+    output_args.add_argument('-original_fasta', action='store', dest='original_fasta',
+                          help='FASTA file to use in conjunction with "-w" or "-con" when running in Partial Mode.',
                           required=False)
-    optional = parser.add_argument_group('Optional Arguments')
-    optional.add_argument('-rc', action='store', dest='reclustered', help='Clustering output file from secondary round of clustering',
-                        required=False)
-    optional.add_argument('-st', action='store', dest='sequence_tag', help='Default - "StORF": Unique identifier to be used to distinguish the second of two rounds of clustered sequences',
+    output_args.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
                         required=False)
-    optional.add_argument('-groups', action="store", dest='core_groups', default="99,95,15",
-                        help='Default - (\'99,95,15\'): Gene family groups to use')
-    optional.add_argument('-gpa', action='store', dest='gene_presence_absence_out', help='Default - False: If selected, a Roary formatted gene_presence_absence.csv will be created - Required for Coinfinder and other downstream tools',
+    ### Misc Arguments
+    misc = parser.add_argument_group('Misc')
+    misc.add_argument('-verbose', action='store', dest='verbose', default=False, type=eval, choices=[True, False],
+                        help='Default - False: Print out runtime messages',
+                        required = False)
+    misc.add_argument('-v', action='store_true', dest='version',
+                        help='Default - False: Print out version number and exit',
                         required=False)
-    parser.add_argument("pyamilyseq_args", nargs=argparse.REMAINDER, help="Additional arguments for PyamilySeq.")
     options = parser.parse_args()
+    ### Checking all required parameters are provided by user
+    if options.run_mode == 'Full':
+        required_full_mode = [options.input_type, options.input_dir, options.name_split, options.clust_tool,
+                              options.pident, options.len_diff]
+        if all(required_full_mode):
+            # Proceed with the Full mode
+            pass
+        else:
+            missing_options = [opt for opt in
+                               ['input_type', 'input_dir', 'name_split', 'clust_tool', 'pident', 'len_diff'] if
+                               not options.__dict__[opt]]
+            print(f"Missing required options for Full mode: {', '.join(missing_options)}")
+    elif options.run_mode == 'Partial':
+        required_partial_mode = [options.cluster_file, ]
+        if all(required_partial_mode):
+            # Proceed with the Partial mode
+            pass
+        else:
+            missing_options = [opt for opt in
+                               ['cluster_file',] if
+                               not options.__dict__[opt]]
+            print(f"Missing required options for Partial mode: {', '.join(missing_options)}")
+    if options.clust_tool == 'CD-HIT':
+        clust_affix = '.clstr'
+    elif options.clust_tool == 'TSV':
+        clust_affix = '.tsv'
+    elif options.clust_tool == 'CSV':
+        clust_affix = '.csv'
+    ###External tool checks:
+    ##MAFFT
+    if options.con_core == True:
+        if is_tool_installed('mafft'):
+            if options.verbose == True:
+                print("mafft is installed. Proceeding with alignment.")
+        else:
+            exit("mafft is not installed. Please install mafft to proceed.")
+    ##CD-HIT
+    if options.clust_tool == 'CD-HIT':
+        if is_tool_installed('cd-hit'):
+            if options.verbose == True:
+                print("cd-hit is installed. Proceeding with clustering.")
+        else:
+            exit("cd-hit is not installed. Please install cd-hit to proceed.")
+    if options.write_families != None and options.original_fasta == False:
+        exit("-fasta must br provided if -w is used")
+    options.core_groups = options.core_groups + ',0'
+    if options.cluster_file:
+        options.cluster_file = fix_path(options.cluster_file)
+    if options.reclustered:
+        options.reclustered = fix_path(options.reclustered)
+    if options.input_dir:
+        options.input_dir = fix_path(options.input_dir)
+    if options.output_dir:
+        options.output_dir = fix_path(options.output_dir)
     output_path = os.path.abspath(options.output_dir)
-    combined_out_file = os.path.join(output_path,"end_to_end_combined_sequences.fasta")
-    clustering_output = os.path.join(output_path,'clustering_'+options.clustering_type)
+    combined_out_file = os.path.join(output_path, "combined_sequences.fasta")
+    clustering_output = os.path.join(output_path, 'clustering_' + options.clust_tool)
+    if options.run_mode == 'Full':
-    # Step 1: Read and rename sequences from files based on input type
-    if options.input_type == 'separate':
-        read_separate_files(options.input_dir, options.name_split, combined_out_file)
-    else:
-        read_combined_files(options.input_dir, options.name_split, combined_out_file)
-    # Step 2: Run CD-HIT on the renamed sequences
-    run_cd_hit(combined_out_file, clustering_output, options)
+        if options.input_type == 'separate':
+            read_separate_files(options.input_dir, options.name_split, combined_out_file)
+        else:
+            read_combined_files(options.input_dir, options.name_split, combined_out_file)
+        run_cd_hit(combined_out_file, clustering_output, options)
+        class clustering_options:
+            def __init__(self):
+                self.cluster_format = options.clust_tool
+                self.reclustered = options.reclustered
+                self.sequence_tag = options.sequence_tag
+                self.core_groups = '99,95,15,0'
+                self.clusters = clustering_output + clust_affix
+                self.gene_presence_absence_out = options.gene_presence_absence_out
+                self.write_families = options.write_families
+                self.con_core = options.con_core
+                self.fasta = combined_out_file
+                self.verbose = options.verbose
+        clustering_options = clustering_options()
+    elif options.run_mode == 'Partial':
+        class clustering_options:
+            def __init__(self):
+                self.cluster_format = options.clust_tool
+                self.reclustered = options.reclustered
+                self.sequence_tag = options.sequence_tag
+                self.core_groups = '99,95,15,0'
+                self.clusters = options.cluster_file
+                self.gene_presence_absence_out = options.gene_presence_absence_out
+                self.write_families = options.write_families
+                self.con_core = options.con_core
+                self.fasta = options.original_fasta
+                self.verbose = options.verbose
+        clustering_options = clustering_options()
-    class clustering_options:
-        def __init__(self):
-            self.format = 'CD-HIT'
-            self.reclustered = options.reclustered
-            self.sequence_tag = 'StORF'
-            self.core_groups = '99,95,15,0'
-            self.clusters = clustering_output+'.clstr'
-            self.gene_presence_absence_out = options.gene_presence_absence_out
-            self.write_families = options.write_families
-            self.con_core = options.con_core
-    clustering_options = clustering_options()
-    # Step 3: Run PyamilySeq with the CD-HIT output
     cluster(clustering_options)
-    #run_pyamilyseq(options.clustering_out, options.clustering_type, combined_out_file, options.pyamilyseq_args)
+    print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
+          "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")
 if __name__ == "__main__":
     main()

PyamilySeq 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

PyamilySeq 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl