PyPI - PyamilySeq - Versions diffs - 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

PyamilySeq 1.0.1py3-none-any.whl → 1.1.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

PyamilySeq/Cluster_Compare.py +108 -0
PyamilySeq/Cluster_Summary.py +59 -64
PyamilySeq/Group_Extractor.py +83 -0
PyamilySeq/Group_Sizes.py +87 -0
PyamilySeq/PyamilySeq.py +26 -18
PyamilySeq/PyamilySeq_Genus.py +3 -3
PyamilySeq/PyamilySeq_Species.py +10 -8
PyamilySeq/Seq_Combiner.py +25 -8
PyamilySeq/clusterings.py +0 -2
PyamilySeq/constants.py +1 -1
PyamilySeq/utils.py +197 -114
{PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/METADATA +46 -85
pyamilyseq-1.1.1.dist-info/RECORD +21 -0
{PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/WHEEL +1 -1
{PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/entry_points.txt +6 -0
PyamilySeq-1.0.1.dist-info/RECORD +0 -18
{PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/LICENSE +0 -0
{PyamilySeq-1.0.1.dist-info → pyamilyseq-1.1.1.dist-info}/top_level.txt +0 -0

PyamilySeq/Cluster_Compare.py ADDED Viewed

@@ -0,0 +1,108 @@
+import argparse
+from collections import defaultdict
+def read_cd_hit_output(clstr_file):
+    """
+    Parse a CD-HIT .clstr file into a sequence-to-cluster lookup.
+    Cluster IDs are a running counter that is bumped on every ">Cluster" line,
+    so the first cluster in the file is numbered 1. Member lines that have no
+    tab-separated second field are ignored.
+    Returns a dictionary mapping sequence header -> integer cluster ID.
+    """
+    membership = {}
+    current_cluster = 0
+    with open(clstr_file, 'r') as handle:
+        for raw_line in handle:
+            entry = raw_line.strip()
+            if not entry:
+                continue
+            if entry.startswith(">Cluster"):
+                current_cluster += 1
+                continue
+            fields = entry.split('\t')
+            if len(fields) < 2:
+                continue
+            # Keep the text between the first '>' and the following '...'
+            after_marker = fields[1].split('>')[1]
+            header = after_marker.split('...')[0]
+            membership[header] = current_cluster
+    return membership
+def compare_cd_hit_clusters(file1, file2, output_file):
+    """
+    Compare two CD-HIT .clstr files and report how sequence membership differs.
+    Every sequence present in both files is checked exactly once and reported as
+    "Mismatch" when its cluster ID differs. Sequences found in only one file are
+    reported with "NA" for the missing side. Clusters whose membership is identical
+    in both files but whose ID differs are reported as "Cluster name change".
+    Args:
+        file1: Path to the first CD-HIT .clstr file.
+        file2: Path to the second CD-HIT .clstr file.
+        output_file: Path of the TSV report to write.
+    Returns:
+        None. Summary metrics are printed to stdout and the rows are written to output_file.
+    """
+    # Read both clustering files
+    clusters1 = read_cd_hit_output(file1)
+    clusters2 = read_cd_hit_output(file2)
+    # Reverse mappings: cluster ID -> set of sequences
+    grouped_clusters1 = defaultdict(set)
+    grouped_clusters2 = defaultdict(set)
+    for seq, cluster_id in clusters1.items():
+        grouped_clusters1[cluster_id].add(seq)
+    for seq, cluster_id in clusters2.items():
+        grouped_clusters2[cluster_id].add(seq)
+    # Initialize metrics counters
+    cluster_name_changes = 0
+    sequence_shifts = 0
+    only_in_file1 = []
+    only_in_file2 = []
+    # Prepare data for the TSV output
+    tsv_data = []
+    for seq, cluster_id in clusters1.items():
+        if seq not in clusters2:
+            only_in_file1.append(seq)
+            tsv_data.append([seq, cluster_id, "NA", "Only in file1"])
+        elif clusters2[seq] != cluster_id:
+            # Shared sequences are examined only in this pass so each shift is counted once
+            sequence_shifts += 1
+            tsv_data.append([seq, cluster_id, clusters2[seq], "Mismatch"])
+    for seq, cluster_id in clusters2.items():
+        if seq not in clusters1:
+            only_in_file2.append(seq)
+            tsv_data.append([seq, "NA", cluster_id, "Only in file2"])
+    # Each sequence maps to one cluster per file, so clusters within a file are disjoint
+    # and a membership set identifies at most one cluster: a direct lookup replaces the
+    # pairwise cluster-by-cluster comparison.
+    members_to_id2 = {frozenset(seqs): cid for cid, seqs in grouped_clusters2.items()}
+    for cluster_id1, seqs1 in grouped_clusters1.items():
+        cluster_id2 = members_to_id2.get(frozenset(seqs1))
+        if cluster_id2 is not None and cluster_id1 != cluster_id2:
+            cluster_name_changes += 1
+            # Sorted so that the row order does not depend on set iteration order
+            for seq in sorted(seqs1):
+                tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])
+    # Print metrics
+    print("🔢 Clustering Comparison Metrics:")
+    print(f"Cluster name changes: {cluster_name_changes}")
+    print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
+    print(f"Sequences only in the first file: {len(only_in_file1)}")
+    print(f"Sequences only in the second file: {len(only_in_file2)}")
+    print()
+    # Write the results to a TSV file
+    with open(output_file, 'w') as out_file:
+        out_file.write("Sequence\tCluster ID (File 1)\tCluster ID (File 2)\tChange Type\n")
+        for row in tsv_data:
+            out_file.write("\t".join(map(str, row)) + "\n")
+    print(f"✅ Results have been written to {output_file}")
+def main():
+    """Command-line entry point: parse the three required paths and run the comparison."""
+    arg_parser = argparse.ArgumentParser(
+        description="Compare two CD-HIT .clstr files to check for clustering consistency.")
+    # (flag, help text) pairs; every option is mandatory
+    for flag, help_text in (
+        ("-file1", "First CD-HIT .clstr file"),
+        ("-file2", "Second CD-HIT .clstr file"),
+        ("-output", "Output file (TSV format)"),
+    ):
+        arg_parser.add_argument(flag, required=True, help=help_text)
+    cli = arg_parser.parse_args()
+    compare_cd_hit_clusters(cli.file1, cli.file2, cli.output)
+if __name__ == "__main__":
+    main()

PyamilySeq/Cluster_Summary.py CHANGED Viewed

@@ -1,33 +1,32 @@
 import argparse
-from collections import OrderedDict
-from collections import defaultdict
+from collections import OrderedDict, defaultdict
 try:
     from .constants import *
     from .utils import *
-except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
+except (ModuleNotFoundError, ImportError, NameError, TypeError):
     from constants import *
     from utils import *
 def categorise_percentage(percent):
     """Categorise the percentage of genomes with multicopy genes."""
-    if 20 <= percent < 40:
-        return "20-40%"
-    elif 40 <= percent < 60:
-        return "40-60%"
-    elif 60 <= percent < 80:
-        return "60-80%"
-    elif 80 <= percent < 95:
-        return "80-95%"
-    elif 95 <= percent < 99:
-        return "95-99%"
-    elif 99 <= percent <= 100:
-        return "99-100%"
+    # Bins are half-open [low, high) except the last one, which also includes 100
+    categories = {
+        (20, 40): "20-40%",
+        (40, 60): "40-60%",
+        (60, 80): "60-80%",
+        (80, 95): "80-95%",
+        (95, 99): "95-99%",
+        (99, 100): "99-100%"
+    }
+    for (low, high), label in categories.items():
+        # Close the top bin at 100 so that exactly 100% is still categorised
+        if low <= percent < high or (high == 100 and percent == 100):
+            return label
     return None
-# Read cd-hit .clstr file and extract information
 def read_cd_hit_output(clustering_output):
+    """Parse CD-HIT .cluster file and extract clustering information."""
     clusters = OrderedDict()
     with open(clustering_output, 'r') as f:
@@ -42,10 +41,8 @@ def read_cd_hit_output(clustering_output):
                 parts = line.split('\t')
                 if len(parts) > 1:
                     clustered_info = parts[1]
-                    length = clustered_info.split(',')[0]
-                    length = int(''.join(c for c in length if c.isdigit()))
-                    clustered_header = clustered_info.split('>')[1].split('...')[0]
-                    clustered_header = '>' + clustered_header
+                    length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
+                    clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]
                     if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
                         percent_identity = extract_identity(clustered_info)
@@ -63,12 +60,14 @@ def read_cd_hit_output(clustering_output):
     return clusters
-# Summarise the information for each cluster
-def summarise_clusters(options,clusters, output):
-    multicopy_groups = defaultdict(int)  # Counter for groups with multicopy genes
+def summarise_clusters(options, clusters, output):
+    """Generate a detailed cluster summary report."""
+    multicopy_groups = defaultdict(int)  # Counter for clusters with multicopy genes
     with open(output, 'w') as out_f:
-        out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")
+        out_f.write(
+            "Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
+        )
         for cluster_id, seqs in clusters.items():
             num_seqs = len(seqs)
@@ -81,82 +80,78 @@ def summarise_clusters(options,clusters, output):
             avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
             identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
-            out_f.write(
-                f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
-            # Count genomes with more than one gene
+            # Count genomes in cluster
             genome_to_gene_count = defaultdict(int)
             for seq in seqs:
-                genome = seq['header'].split('|')[0].replace('>','')
+                genome = seq['header'].split('|')[0].replace('>', '')
                 genome_to_gene_count[genome] += 1
+            num_genomes = len(genome_to_gene_count)
             num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
+            multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0
-            # Calculate the percentage of genomes with multicopy genes
-            multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
+            # Categorize multicopy percentage
             category = categorise_percentage(multicopy_percentage)
             if category:
                 multicopy_groups[category] += 1
-        # Define the order of categories for printout
-        category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
+            # Write detailed output for each cluster
+            out_f.write(
+                f"{cluster_id}\t{num_seqs}\t{num_genomes}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\t"
+                f"{num_genomes_with_multiple_genes}\t{multicopy_percentage:.2f}\n"
+            )
-        # Print the number of clusters with multicopy genes in each percentage range, in the correct order
+        # Define order for multicopy statistics output
+        category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
         for category in category_order:
-            print(f"Number of clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
+            print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
-# Main function to parse arguments and run the analysis
 def main():
-    parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
-    ### Required Arguments
-    required = parser.add_argument_group('Required Parameters')
-    required.add_argument('-input_clstr', action="store", dest="input_clstr",
-                          help='Input CD-HIT .clstr file',
-                          required=True)
-    required.add_argument('-output', action="store", dest="output",
-                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
-                          required=True)
-    required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
-                          help='The total number of genomes must be provide',
-                          required=True)
-    #required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
-    #                      help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
-    #                      required=True)
+    """Main function to parse arguments and process clustering files."""
+    parser = argparse.ArgumentParser(
+        description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
+    # Required Arguments
+    required = parser.add_argument_group('Required Parameters')
+    required.add_argument('-input_cluster', action="store", dest="input_cluster", required=True,
+                          help='Input CD-HIT .cluster file')
+    required.add_argument('-output', action="store", dest="output", required=True,
+                          help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user")
+    required.add_argument('-genome_num', action='store', dest='genome_num', type=int, required=True,
+                          help='Total number of genomes in dataset')
+    # Optional Arguments
     optional = parser.add_argument_group('Optional Arguments')
     optional.add_argument('-output_dir', action="store", dest="output_dir",
-                          help='Default: Same as input file',
-                          required=False)
+                          help='Default: Same as input file', required=False)
     misc = parser.add_argument_group("Misc Parameters")
     misc.add_argument("-verbose", action="store_true", dest="verbose",
-                      help="Print verbose output.",
-                      required=False)
+                      help="Print verbose output.", required=False)
     misc.add_argument("-v", "--version", action="version",
                       version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
                       help="Print out version number and exit")
     options = parser.parse_args()
-    print("Running PyamilySeq " + PyamilySeq_Version+ ": Group-Summary ")
+    print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")
-    ### File handling
-    options.input_clstr = fix_path(options.input_clstr)
+    # File handling
+    options.input_cluster = fix_path(options.input_cluster)
     if options.output_dir is None:
-        options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
+        options.output_dir = os.path.dirname(os.path.abspath(options.input_cluster))
     output_path = os.path.abspath(options.output_dir)
     if not os.path.exists(output_path):
         os.makedirs(output_path)
     output_name = options.output
     if not output_name.endswith('.tsv'):
         output_name += '.tsv'
     output_file_path = os.path.join(output_path, output_name)
-    ###
-    clusters = read_cd_hit_output(options.input_clstr)
-    summarise_clusters(options,clusters, output_file_path)
+    # Process clusters and generate summary
+    clusters = read_cd_hit_output(options.input_cluster)
+    summarise_clusters(options, clusters, output_file_path)
 if __name__ == "__main__":

PyamilySeq/Group_Extractor.py ADDED Viewed

@@ -0,0 +1,83 @@
+import argparse
+import os
+import csv
+def parse_fasta(fasta_file):
+    """
+    Parse a FASTA file into a dictionary of gene ID -> sequence.
+    The gene ID comes from the first whitespace-delimited token of the header line.
+    When that token contains '|', the second '|'-separated field is used; otherwise
+    the whole token is used. Any 'ENSB_' substring is removed from the ID.
+    Sequence lines of a record are concatenated without line breaks. Records whose
+    ID resolves to an empty string are skipped.
+    Args:
+        fasta_file: Path to the FASTA file.
+    Returns:
+        dict mapping gene ID (str) to its full sequence (str).
+    """
+    sequences = {}
+    gene_id = None
+    sequence = []
+    with open(fasta_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith(">"):
+                if gene_id:  # Save the previous gene
+                    sequences[gene_id] = ''.join(sequence)
+                tokens = line[1:].split()
+                header = tokens[0] if tokens else ''
+                fields = header.split('|')
+                # Headers without a '|' field fall back to the whole token instead of raising IndexError
+                gene_id = (fields[1] if len(fields) > 1 else header).replace('ENSB_', '')
+                sequence = []
+            else:
+                sequence.append(line)
+    if gene_id:  # Save the last gene
+        sequences[gene_id] = ''.join(sequence)
+    return sequences
+def parse_csv(csv_file):
+    """
+    Parse a comma-delimited file into a dictionary of group ID -> gene IDs.
+    The first line is treated as a header and skipped. For every remaining row,
+    column 0 is the group ID and each cell from index 14 onward is split on ';'
+    to collect gene IDs; surrounding whitespace and empty entries are dropped.
+    Blank rows are ignored and an empty file yields an empty dictionary.
+    Args:
+        csv_file: Path to the comma-delimited input file.
+    Returns:
+        dict mapping group ID (str) to a list of gene IDs (list of str).
+    """
+    groups = {}
+    # newline='' lets the csv module handle line endings itself, as its documentation recommends
+    with open(csv_file, 'r', newline='') as f:
+        reader = csv.reader(f, delimiter=',')
+        next(reader, None)  # Skip the header line; the default avoids StopIteration on an empty file
+        for row in reader:
+            if not row:
+                continue  # csv.reader yields an empty list for a blank line
+            group_id = row[0]
+            # Flatten the ';'-separated cells from column 14 onward and drop empty entries
+            gene_ids = [gene.strip() for cell in row[14:] for gene in cell.split(';') if gene.strip()]
+            groups[group_id] = gene_ids
+    return groups
+def write_group_fastas(groups, sequences, output_dir):
+    """
+    Write one "<group_id>.fasta" file per group into output_dir.
+    The directory is created when it does not exist yet. Gene IDs that are
+    absent from `sequences` are skipped and a warning is printed to stdout.
+    """
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    for group_id in groups:
+        destination = os.path.join(output_dir, f"{group_id}.fasta")
+        with open(destination, 'w') as handle:
+            for gene_id in groups[group_id]:
+                if gene_id not in sequences:
+                    print(f"Warning: Gene ID {gene_id} not found in FASTA file.")
+                    continue
+                handle.write(f">{gene_id}\n{sequences[gene_id]}\n")
+def main():
+    """Command-line entry point: read the FASTA and CSV inputs, then write one FASTA file per group."""
+    cli_parser = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")
+    # (flag, help text) pairs; every option is mandatory
+    for flag, help_text in (
+        ("-fasta", "Input FASTA file containing gene sequences."),
+        ("-csv", "Input CSV file containing group and gene information."),
+        ("-output_dir", "Directory to save the grouped FASTA files."),
+    ):
+        cli_parser.add_argument(flag, required=True, help=help_text)
+    opts = cli_parser.parse_args()
+    # Load both inputs before anything is written
+    print("Parsing FASTA file...")
+    fasta_records = parse_fasta(opts.fasta)
+    print("Parsing CSV file...")
+    gene_groups = parse_csv(opts.csv)
+    print("Writing grouped FASTA files...")
+    write_group_fastas(gene_groups, fasta_records, opts.output_dir)
+    print("Process completed successfully.")
+if __name__ == "__main__":
+    main()

PyamilySeq/Group_Sizes.py ADDED Viewed

@@ -0,0 +1,87 @@
+import argparse
+import os
+import csv
+def parse_fasta_stats(fasta_file):
+    """
+    Compute length statistics for the sequences of a FASTA file.
+    A record is counted only when at least one line follows its header before
+    the next header (or the end of the file). Returns a dictionary with the
+    keys num_sequences, min_length, max_length, avg_length, length_diff and
+    percent_diff; all values are 0 when no sequence was counted.
+    """
+    lengths = []
+    lines_in_record = 0  # Lines seen since the last header
+    residues_in_record = 0  # Summed length of those lines
+    with open(fasta_file, 'r') as handle:
+        for raw_line in handle:
+            text = raw_line.strip()
+            if not text.startswith(">"):
+                lines_in_record += 1
+                residues_in_record += len(text)
+                continue
+            # A header closes the record that was being accumulated
+            if lines_in_record:
+                lengths.append(residues_in_record)
+            lines_in_record = 0
+            residues_in_record = 0
+    if lines_in_record:  # Close the final record
+        lengths.append(residues_in_record)
+    count = len(lengths)
+    if count == 0:
+        return {
+            "num_sequences": 0,
+            "min_length": 0,
+            "max_length": 0,
+            "avg_length": 0,
+            "length_diff": 0,
+            "percent_diff": 0
+        }
+    shortest = min(lengths)
+    longest = max(lengths)
+    spread = longest - shortest
+    # Relative spread is measured against the shortest sequence; 0 when that length is 0
+    if shortest > 0:
+        relative_spread = spread / shortest * 100
+    else:
+        relative_spread = 0
+    return {
+        "num_sequences": count,
+        "min_length": shortest,
+        "max_length": longest,
+        "avg_length": sum(lengths) / count,
+        "length_diff": spread,
+        "percent_diff": relative_spread
+    }
+def process_fasta_directory(input_dir, output_csv):
+    """
+    Summarise every ".fasta" file in input_dir and write one CSV row per file.
+    Files are processed in sorted name order so that the CSV is reproducible;
+    os.listdir() on its own returns entries in arbitrary order.
+    Args:
+        input_dir: Directory that is scanned (non-recursively) for ".fasta" files.
+        output_csv: Path of the CSV file to write.
+    Returns:
+        None. The CSV contains a header row followed by one row per FASTA file.
+    """
+    fieldnames = ["file_name", "num_sequences", "min_length", "max_length", "avg_length", "length_diff",
+                  "percent_diff"]
+    results = []
+    for filename in sorted(os.listdir(input_dir)):
+        if not filename.endswith(".fasta"):
+            continue
+        stats = parse_fasta_stats(os.path.join(input_dir, filename))
+        # Copy only the reported statistics, in column order, next to the file name
+        row = {"file_name": filename}
+        row.update((key, stats[key]) for key in fieldnames[1:])
+        results.append(row)
+    # Write results to a CSV file
+    with open(output_csv, 'w', newline='') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(results)
+def main():
+    """Command-line entry point: summarise the FASTA files of a directory into a CSV file."""
+    cli_parser = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")
+    # (flag, help text) pairs; every option is mandatory
+    for flag, help_text in (
+        ("-input_dir", "Directory containing FASTA files."),
+        ("-output_csv", "Output CSV file to save statistics."),
+    ):
+        cli_parser.add_argument(flag, required=True, help=help_text)
+    opts = cli_parser.parse_args()
+    print("Processing FASTA files...")
+    process_fasta_directory(opts.input_dir, opts.output_csv)
+    print(f"Statistics saved to {opts.output_csv}")
+if __name__ == "__main__":
+    main()

PyamilySeq/PyamilySeq.py CHANGED Viewed

@@ -20,8 +20,8 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
         clustering_mode,
         '-i', input_file,
         '-o', clustering_output,
-        '-c', str(options.pident),
-        '-s', str(options.len_diff),
+        '-c', f"{float(options.pident):.2f}",
+        '-s', f"{float(options.len_diff):.2f}",
         '-T', str(options.threads),
         '-M', str(options.mem),
         '-d', "0",
@@ -54,16 +54,19 @@ def main():
                              help="Directory containing GFF/FASTA files - Use with -input_type separate/combined.")
     full_parser.add_argument("-input_fasta", required=False,
                              help="Input FASTA file - Use with - input_type fasta.")
-    full_parser.add_argument("-name_split", required=False,
-                             help="Substring to split filenames and extract genome names (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
+    full_parser.add_argument("-name_split_gff", required=False,
+                             help="Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
+    full_parser.add_argument("-name_split_fasta", required=False,
+                             help="Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.")
     full_parser.add_argument("-sequence_type", choices=['AA', 'DNA'], default="AA", required=False,
                              help="Clustering mode: 'DNA' or 'AA'.")
     full_parser.add_argument("-gene_ident", default="CDS", required=False,
                              help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
-    full_parser.add_argument("-c", type=float, dest="pident", default=0.90, required=False,
+    full_parser.add_argument("-c", type=str, dest="pident", default="0.90", required=False,
                              help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
-    full_parser.add_argument("-s", type=float, dest="len_diff", default=0.80, required=False,
+    full_parser.add_argument("-s", type=str, dest="len_diff", default="0.80", required=False,
                              help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
     full_parser.add_argument("-fast_mode", action="store_true", required=False,
                              help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")
@@ -91,12 +94,12 @@ def main():
                                help="Gene groupings for 'Species' mode (default: '99,95,15').")
         subparser.add_argument("-genus_groups", default="1,2,3,4,5,6,7,8,9,10", required=False,
                                help="Gene groupings for 'Genus' mode (default: '1-10').")
-        subparser.add_argument("-w", default=None, dest="write_groups", required=False,
-                               help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95').")
-        subparser.add_argument("-wi", action="store_true", dest="write_individual_groups", required=False,
+        subparser.add_argument("-write_groups", default=None, dest="write_groups", required=False,
+                               help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95'). - triggers '-wig'.")
+        subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups", required=False,
                                help="Output individual FASTA files for each group.")
-        subparser.add_argument("-a", action="store_true", dest="align_core", required=False,
-                               help="Align and concatenate sequences for 'core' groups.")
+        subparser.add_argument("-align", action="store_true", dest="align_core", required=False,
+                               help="Align and concatenate sequences for 'core' groups (those in 99-100% of genomes).")
         subparser.add_argument("-align_aa", action="store_true", required=False,
                                help="Align sequences as amino acids.")
         subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out", required=False,
@@ -115,6 +118,9 @@ def main():
     # Parse Arguments
     options = parser.parse_args()
+    if options.write_groups != None and options.write_individual_groups == False:
+        options.write_individual_groups = True
     # Example of conditional logic based on selected mode
     print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
     if options.run_mode == "Full" and options.verbose == True:
@@ -129,13 +135,13 @@ def main():
             sys.exit("Currently reclustering only works on Partial Mode.")
         required_full_mode = [options.input_type, options.pident, options.len_diff]
         if options.input_type != 'fasta':
-            required_full_mode.extend([options.input_dir, options.name_split])
+            required_full_mode.extend([options.input_dir, options.name_split_gff])
         if all(required_full_mode):
             # Proceed with the Full mode
             pass
         else:
             missing_options = [opt for opt in
-                               ['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
+                               ['input_type', 'input_dir', 'name_split_gff', 'clustering_format', 'pident', 'len_diff'] if
                                not options.__dict__.get(opt)]
             sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
         if options.align_core:
@@ -182,13 +188,13 @@ def main():
             elif options.sequence_type == 'AA':
                 clustering_mode = 'cd-hit'
             if options.fast_mode == True:
-                options.fast_mode = 0
+                options.fast_mode = 1
                 if options.verbose == True:
                     print("Running CD-HIT in fast mode.")
             else:
-                options.fast_mode = 1
+                options.fast_mode = 0
                 if options.verbose == True:
-                    print("Running CD-HIT in slow mode.")
+                    print("Running CD-HIT in accurate mode.")
         else:
             exit("cd-hit is not installed. Please install cd-hit to proceed.")
@@ -234,10 +240,10 @@ def main():
             translate = False
             file_to_cluster = combined_out_file
         if options.input_type == 'separate':
-            read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
+            read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate, False)
             run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
         elif options.input_type == 'combined':
-            read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
+            read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate, False)
             run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
         elif options.input_type == 'fasta':
             combined_out_file = options.input_fasta
@@ -276,6 +282,8 @@ def main():
         clustering_options = clustering_options()
     elif options.run_mode == 'Partial':
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
         class clustering_options:
             def __init__(self):
                 self.run_mode = options.run_mode

PyamilySeq/PyamilySeq_Genus.py CHANGED Viewed

@@ -23,8 +23,8 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
     gpa_outfile.write('"\n')
     for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
         average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
-        gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
-                         '","","","","","","","","",""')
+        gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
+                         '","","","","","","","",""')
         for genus in genus_dict.keys():
@@ -34,7 +34,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
                 if value.split('_')[0] == genus:
                     tmp_list.append(value)
             if tmp_list:
-                full_out += ',"'+''.join(tmp_list)+'"'
+                full_out += ',"'+'  '.join(tmp_list)+'"'
             else:
                 full_out = ',""'
             gpa_outfile.write(full_out)

PyamilySeq/PyamilySeq_Species.py CHANGED Viewed

@@ -21,7 +21,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
     gpa_outfile.write('"\n')
     for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
         average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
-        gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
+        gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
                          '","","","","","","","","",""')
@@ -32,7 +32,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
                 if value.split('|')[0] == genome:
                     tmp_list.append(value.split('|')[1])
             if tmp_list:
-                full_out += ',"'+'\t'.join(tmp_list)+'"'
+                full_out += ',"'+'  '.join(tmp_list)+'"'
             else:
                 full_out = ',""'
             gpa_outfile.write(full_out)
@@ -120,12 +120,14 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
 #@profile
 def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
-    groups_as_list = list(groups.values())
-    for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
-        res = idx
-    family_group = list(groups)[res]
-    cores['only_Second_core_' + family_group].append(cluster)
+    # Collect the names of all groups whose (sec, fir) bounds contain Second_num
+    matching = [name for name, (sec, fir) in groups.items() if sec <= Second_num <= fir]
+    if not matching:
+        # No group range contains Second_num: exit explicitly rather than via UnboundLocalError
+        sys.exit("Error in calc_only_Second_only_core")
+    # When several ranges match, the last one in dictionary order is used
+    cores['only_Second_core_' + matching[-1]].append(cluster)
 #@profile

PyamilySeq 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl

PyamilySeq 1.0.1py3-none-any.whl → 1.1.1py3-none-any.whl