PyamilySeq 1.0.1__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
1
+ import argparse
2
+ from collections import defaultdict
3
+
4
def read_cd_hit_output(clstr_file):
    """
    Read a CD-HIT .clstr file and extract sequence clusters.

    Returns a dict mapping each sequence header to a numeric cluster ID.
    IDs are assigned in file order starting at 1 (CD-HIT's own cluster
    names are ignored).
    """
    seq_to_cluster = {}
    current_cluster = 0  # incremented each time a ">Cluster" line is seen

    with open(clstr_file, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry.startswith(">Cluster"):
                current_cluster += 1
                continue
            if not entry:
                continue
            fields = entry.split('\t')
            if len(fields) > 1:
                # Member lines look like: "0\t100aa, >header... *"
                header = fields[1].split('>')[1].split('...')[0]
                seq_to_cluster[header] = current_cluster

    return seq_to_cluster
24
+
25
def compare_cd_hit_clusters(file1, file2, output_file):
    """
    Compares two CD-HIT .clstr files to check if clusters are the same.
    Writes the results to a TSV file.

    Metrics reported:
      - cluster name changes: identical sequence sets filed under different IDs
      - sequence shifts: sequences assigned to different clusters between the
        two files (each shifted sequence is counted exactly once)
      - sequences present in only one of the two files (counted per sequence)
    """
    # Read both clustering files (header -> cluster ID)
    clusters1 = read_cd_hit_output(file1)
    clusters2 = read_cd_hit_output(file2)

    # Reverse mappings: cluster ID -> set of sequences
    grouped_clusters1 = defaultdict(set)
    grouped_clusters2 = defaultdict(set)

    for seq, cluster_id in clusters1.items():
        grouped_clusters1[cluster_id].add(seq)
    for seq, cluster_id in clusters2.items():
        grouped_clusters2[cluster_id].add(seq)

    # Initialize metrics counters
    cluster_name_changes = 0
    sequence_shifts = 0
    only_in_file1 = defaultdict(list)  # cluster ID -> sequences missing from file2
    only_in_file2 = defaultdict(list)  # cluster ID -> sequences missing from file1
    cluster_mismatches = defaultdict(list)

    # Prepare data for the TSV output
    tsv_data = []

    # Sequences in file1: either missing from file2 or in a different cluster.
    for seq, cluster_id in clusters1.items():
        if seq not in clusters2:
            only_in_file1[cluster_id].append(seq)
            tsv_data.append([seq, cluster_id, "NA", "Only in file1"])
        elif clusters2[seq] != cluster_id:
            # Count each shifted sequence ONCE here; the previous version
            # repeated this test in the clusters2 loop, doubling the count
            # and emitting duplicate "Mismatch" rows.
            sequence_shifts += 1
            cluster_mismatches[seq].append((cluster_id, clusters2[seq]))
            tsv_data.append([seq, cluster_id, clusters2[seq], "Mismatch"])

    # Sequences only present in file2 (mismatches already handled above).
    for seq, cluster_id in clusters2.items():
        if seq not in clusters1:
            only_in_file2[cluster_id].append(seq)
            tsv_data.append([seq, "NA", cluster_id, "Only in file2"])

    # Track cluster name changes (same sequence sets, different cluster IDs).
    for cluster_id1, seqs1 in grouped_clusters1.items():
        for cluster_id2, seqs2 in grouped_clusters2.items():
            if seqs1 == seqs2 and cluster_id1 != cluster_id2:
                cluster_name_changes += 1
                for seq in seqs1:
                    tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])

    # Print metrics. "Only in" counts are numbers of sequences; the previous
    # version printed len(only_in_file1), which is the number of clusters.
    print("🔢 Clustering Comparison Metrics:")
    print(f"Cluster name changes: {cluster_name_changes}")
    print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
    print(f"Sequences only in the first file: {sum(len(v) for v in only_in_file1.values())}")
    print(f"Sequences only in the second file: {sum(len(v) for v in only_in_file2.values())}")
    print()

    # Write the results to a TSV file
    with open(output_file, 'w') as out_file:
        out_file.write("Sequence\tCluster ID (File 1)\tCluster ID (File 2)\tChange Type\n")
        for row in tsv_data:
            out_file.write("\t".join(map(str, row)) + "\n")

    print(f"✅ Results have been written to {output_file}")
97
+
98
def main():
    """CLI entry point: compare two CD-HIT .clstr files for consistency."""
    cli = argparse.ArgumentParser(
        description="Compare two CD-HIT .clstr files to check for clustering consistency.")
    cli.add_argument("-file1", required=True, help="First CD-HIT .clstr file")
    cli.add_argument("-file2", required=True, help="Second CD-HIT .clstr file")
    cli.add_argument("-output", required=True, help="Output file (TSV format)")
    parsed = cli.parse_args()

    compare_cd_hit_clusters(parsed.file1, parsed.file2, parsed.output)
106
+
107
+ if __name__ == "__main__":
108
+ main()
@@ -1,33 +1,32 @@
1
1
  import argparse
2
- from collections import OrderedDict
3
- from collections import defaultdict
2
+ from collections import OrderedDict, defaultdict
4
3
 
5
4
  try:
6
5
  from .constants import *
7
6
  from .utils import *
8
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
7
+ except (ModuleNotFoundError, ImportError, NameError, TypeError):
9
8
  from constants import *
10
9
  from utils import *
11
10
 
12
11
 
13
12
def categorise_percentage(percent):
    """Categorise the percentage of genomes with multicopy genes.

    Returns the matching bin label (e.g. "40-60%") or None when the
    percentage is below 20 (or outside 0-100).
    """
    # Bins are half-open [low, high) except the last: percent == 100 must map
    # to "99-100%". A plain `low <= percent < high` loop silently dropped the
    # 100% case that the original chained-elif version handled with `<=`.
    categories = {
        (20, 40): "20-40%",
        (40, 60): "40-60%",
        (60, 80): "60-80%",
        (80, 95): "80-95%",
        (95, 99): "95-99%",
        (99, 100): "99-100%"
    }
    for (low, high), label in categories.items():
        # Half-open test, with the top bin made inclusive of 100.
        if low <= percent < high or (high == 100 and percent == 100):
            return label
    return None
28
26
 
29
- # Read cd-hit .clstr file and extract information
27
+
30
28
  def read_cd_hit_output(clustering_output):
29
+ """Parse CD-HIT .cluster file and extract clustering information."""
31
30
  clusters = OrderedDict()
32
31
 
33
32
  with open(clustering_output, 'r') as f:
@@ -42,10 +41,8 @@ def read_cd_hit_output(clustering_output):
42
41
  parts = line.split('\t')
43
42
  if len(parts) > 1:
44
43
  clustered_info = parts[1]
45
- length = clustered_info.split(',')[0]
46
- length = int(''.join(c for c in length if c.isdigit()))
47
- clustered_header = clustered_info.split('>')[1].split('...')[0]
48
- clustered_header = '>' + clustered_header
44
+ length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
45
+ clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]
49
46
 
50
47
  if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
51
48
  percent_identity = extract_identity(clustered_info)
@@ -63,12 +60,14 @@ def read_cd_hit_output(clustering_output):
63
60
  return clusters
64
61
 
65
62
 
66
- # Summarise the information for each cluster
67
- def summarise_clusters(options,clusters, output):
68
- multicopy_groups = defaultdict(int) # Counter for groups with multicopy genes
63
+ def summarise_clusters(options, clusters, output):
64
+ """Generate a detailed cluster summary report."""
65
+ multicopy_groups = defaultdict(int) # Counter for clusters with multicopy genes
69
66
 
70
67
  with open(output, 'w') as out_f:
71
- out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")
68
+ out_f.write(
69
+ "Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
70
+ )
72
71
 
73
72
  for cluster_id, seqs in clusters.items():
74
73
  num_seqs = len(seqs)
@@ -81,82 +80,78 @@ def summarise_clusters(options,clusters, output):
81
80
  avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
82
81
  identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"
83
82
 
84
- out_f.write(
85
- f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
86
-
87
- # Count genomes with more than one gene
83
+ # Count genomes in cluster
88
84
  genome_to_gene_count = defaultdict(int)
89
85
  for seq in seqs:
90
- genome = seq['header'].split('|')[0].replace('>','')
86
+ genome = seq['header'].split('|')[0].replace('>', '')
91
87
  genome_to_gene_count[genome] += 1
92
88
 
89
+ num_genomes = len(genome_to_gene_count)
93
90
  num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
91
+ multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0
94
92
 
95
- # Calculate the percentage of genomes with multicopy genes
96
-
97
- multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
93
+ # Categorize multicopy percentage
98
94
  category = categorise_percentage(multicopy_percentage)
99
95
  if category:
100
96
  multicopy_groups[category] += 1
101
97
 
102
- # Define the order of categories for printout
103
- category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
98
+ # Write detailed output for each cluster
99
+ out_f.write(
100
+ f"{cluster_id}\t{num_seqs}\t{num_genomes}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\t"
101
+ f"{num_genomes_with_multiple_genes}\t{multicopy_percentage:.2f}\n"
102
+ )
104
103
 
105
- # Print the number of clusters with multicopy genes in each percentage range, in the correct order
104
+ # Define order for multicopy statistics output
105
+ category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
106
106
  for category in category_order:
107
- print(f"Number of clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
107
+ print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
108
108
 
109
109
 
110
- # Main function to parse arguments and run the analysis
111
110
  def main():
112
- parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
113
- ### Required Arguments
114
- required = parser.add_argument_group('Required Parameters')
115
- required.add_argument('-input_clstr', action="store", dest="input_clstr",
116
- help='Input CD-HIT .clstr file',
117
- required=True)
118
- required.add_argument('-output', action="store", dest="output",
119
- help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
120
- required=True)
121
- required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
122
- help='The total number of genomes must be provide',
123
- required=True)
124
- #required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
125
- # help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
126
- # required=True)
111
+ """Main function to parse arguments and process clustering files."""
112
+ parser = argparse.ArgumentParser(
113
+ description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
127
114
 
115
+ # Required Arguments
116
+ required = parser.add_argument_group('Required Parameters')
117
+ required.add_argument('-input_cluster', action="store", dest="input_cluster", required=True,
118
+ help='Input CD-HIT .cluster file')
119
+ required.add_argument('-output', action="store", dest="output", required=True,
120
+ help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user")
121
+ required.add_argument('-genome_num', action='store', dest='genome_num', type=int, required=True,
122
+ help='Total number of genomes in dataset')
123
+
124
+ # Optional Arguments
128
125
  optional = parser.add_argument_group('Optional Arguments')
129
126
  optional.add_argument('-output_dir', action="store", dest="output_dir",
130
- help='Default: Same as input file',
131
- required=False)
127
+ help='Default: Same as input file', required=False)
132
128
 
133
129
  misc = parser.add_argument_group("Misc Parameters")
134
130
  misc.add_argument("-verbose", action="store_true", dest="verbose",
135
- help="Print verbose output.",
136
- required=False)
131
+ help="Print verbose output.", required=False)
137
132
  misc.add_argument("-v", "--version", action="version",
138
133
  version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
139
134
  help="Print out version number and exit")
140
135
 
141
-
142
136
  options = parser.parse_args()
143
- print("Running PyamilySeq " + PyamilySeq_Version+ ": Group-Summary ")
137
+ print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")
144
138
 
145
- ### File handling
146
- options.input_clstr = fix_path(options.input_clstr)
139
+ # File handling
140
+ options.input_cluster = fix_path(options.input_cluster)
147
141
  if options.output_dir is None:
148
- options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
142
+ options.output_dir = os.path.dirname(os.path.abspath(options.input_cluster))
149
143
  output_path = os.path.abspath(options.output_dir)
150
144
  if not os.path.exists(output_path):
151
145
  os.makedirs(output_path)
146
+
152
147
  output_name = options.output
153
148
  if not output_name.endswith('.tsv'):
154
149
  output_name += '.tsv'
155
150
  output_file_path = os.path.join(output_path, output_name)
156
- ###
157
151
 
158
- clusters = read_cd_hit_output(options.input_clstr)
159
- summarise_clusters(options,clusters, output_file_path)
152
+ # Process clusters and generate summary
153
+ clusters = read_cd_hit_output(options.input_cluster)
154
+ summarise_clusters(options, clusters, output_file_path)
160
155
 
161
156
 
162
157
  if __name__ == "__main__":
@@ -0,0 +1,83 @@
1
+ import argparse
2
+ import os
3
+ import csv
4
+
5
+
6
def parse_fasta(fasta_file):
    """
    Parses a FASTA file and returns a dictionary of gene IDs and sequences.

    Header lines are expected to look like '>genome|ENSB_geneID description':
    the gene ID is the second '|'-separated field of the first whitespace
    token, with any 'ENSB_' prefix removed.  Headers without a '|' now fall
    back to the whole first token (the previous `split('|')[1]` raised
    IndexError on them).
    """
    sequences = {}
    with open(fasta_file, 'r') as f:
        gene_id = None
        sequence = []
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                if gene_id:  # Save the previous gene
                    sequences[gene_id] = ''.join(sequence)
                token = line[1:].split()[0]
                fields = token.split('|')
                # Second field when '|' is present, else the token itself.
                gene_id = (fields[1] if len(fields) > 1 else fields[0]).replace('ENSB_', '')
                sequence = []
            else:
                sequence.append(line)
        if gene_id:  # Save the last gene
            sequences[gene_id] = ''.join(sequence)
    return sequences
26
+
27
+
28
def parse_csv(csv_file):
    """
    Parses a CSV file to extract group IDs and gene IDs (skipping the header line).

    Returns {group_id: [gene_id, ...]}. Gene IDs are read from column 14
    onward; each cell may hold several ';'-separated genes, which are
    flattened and stripped of surrounding whitespace.
    """
    groups = {}
    with open(csv_file, 'r') as f:
        reader = csv.reader(f, delimiter=',')  # comma-delimited (old comment wrongly said tab)
        # next(reader, None) instead of next(reader): an empty file should
        # yield no groups rather than raise StopIteration.
        if next(reader, None) is None:
            return groups
        for row in reader:
            if not row:  # tolerate blank lines
                continue
            group_id = row[0]
            gene_ids = row[14:]  # Read from column 14 onward
            gene_ids = [gene.strip() for genes in gene_ids for gene in genes.split(';') if
                        gene.strip()]  # Flatten and clean
            groups[group_id] = gene_ids
    return groups
43
+
44
+
45
def write_group_fastas(groups, sequences, output_dir):
    """
    Write one FASTA file per group into output_dir, each containing the
    sequences of that group's gene IDs. Missing gene IDs are reported on
    stdout and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    for group_id, gene_ids in groups.items():
        target = os.path.join(output_dir, f"{group_id}.fasta")
        with open(target, 'w') as handle:
            for gid in gene_ids:
                if gid not in sequences:
                    print(f"Warning: Gene ID {gid} not found in FASTA file.")
                    continue
                handle.write(f">{gid}\n{sequences[gid]}\n")
60
+
61
+
62
def main():
    """Drive the FASTA-grouping workflow from the command line."""
    cli = argparse.ArgumentParser(description="Process FASTA and CSV files to create grouped FASTA outputs.")
    cli.add_argument("-fasta", required=True, help="Input FASTA file containing gene sequences.")
    cli.add_argument("-csv", required=True, help="Input CSV file containing group and gene information.")
    cli.add_argument("-output_dir", required=True, help="Directory to save the grouped FASTA files.")

    opts = cli.parse_args()

    # Parse the input files, then emit one FASTA per group.
    print("Parsing FASTA file...")
    sequences = parse_fasta(opts.fasta)
    print("Parsing CSV file...")
    groups = parse_csv(opts.csv)

    print("Writing grouped FASTA files...")
    write_group_fastas(groups, sequences, opts.output_dir)
    print("Process completed successfully.")
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
@@ -0,0 +1,87 @@
1
+ import argparse
2
+ import os
3
+ import csv
4
+
5
+
6
def parse_fasta_stats(fasta_file):
    """
    Collect sequence-length statistics for one FASTA file.

    Returns a dict with: num_sequences, min_length, max_length, avg_length,
    length_diff (max - min) and percent_diff (length_diff as a percentage of
    min_length; 0 when min_length is 0). All fields are 0 for an empty file.
    """
    lengths = []
    chunk = []  # sequence lines of the record currently being read

    with open(fasta_file, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if raw.startswith(">"):
                # A new header closes the previous record, if any.
                if chunk:
                    lengths.append(len(''.join(chunk)))
                chunk = []
            else:
                chunk.append(raw)
    if chunk:  # close the final record
        lengths.append(len(''.join(chunk)))

    num_sequences = len(lengths)
    if num_sequences == 0:
        return {
            "num_sequences": 0,
            "min_length": 0,
            "max_length": 0,
            "avg_length": 0,
            "length_diff": 0,
            "percent_diff": 0
        }

    min_length = min(lengths)
    max_length = max(lengths)
    length_diff = max_length - min_length
    return {
        "num_sequences": num_sequences,
        "min_length": min_length,
        "max_length": max_length,
        "avg_length": sum(lengths) / num_sequences,
        "length_diff": length_diff,
        "percent_diff": (length_diff / min_length * 100) if min_length > 0 else 0
    }
43
+
44
+
45
def process_fasta_directory(input_dir, output_csv):
    """
    Summarise every '.fasta' file in input_dir into a single CSV report,
    one row of length statistics per file.
    """
    fieldnames = ["file_name", "num_sequences", "min_length", "max_length",
                  "avg_length", "length_diff", "percent_diff"]

    rows = []
    for entry in os.listdir(input_dir):
        if not entry.endswith(".fasta"):
            continue  # ignore non-FASTA files
        stats = parse_fasta_stats(os.path.join(input_dir, entry))
        record = {"file_name": entry}
        for key in fieldnames[1:]:
            record[key] = stats[key]
        rows.append(record)

    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
71
+
72
+
73
def main():
    """Command-line wrapper around process_fasta_directory."""
    cli = argparse.ArgumentParser(description="Summarize sequence statistics for a directory of FASTA files.")
    cli.add_argument("-input_dir", required=True, help="Directory containing FASTA files.")
    cli.add_argument("-output_csv", required=True, help="Output CSV file to save statistics.")

    opts = cli.parse_args()

    # Process the directory of FASTA files
    print("Processing FASTA files...")
    process_fasta_directory(opts.input_dir, opts.output_csv)
    print(f"Statistics saved to {opts.output_csv}")
84
+
85
+
86
+ if __name__ == "__main__":
87
+ main()
PyamilySeq/PyamilySeq.py CHANGED
@@ -20,8 +20,8 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
20
20
  clustering_mode,
21
21
  '-i', input_file,
22
22
  '-o', clustering_output,
23
- '-c', str(options.pident),
24
- '-s', str(options.len_diff),
23
+ '-c', f"{float(options.pident):.2f}",
24
+ '-s', f"{float(options.len_diff):.2f}",
25
25
  '-T', str(options.threads),
26
26
  '-M', str(options.mem),
27
27
  '-d', "0",
@@ -54,16 +54,19 @@ def main():
54
54
  help="Directory containing GFF/FASTA files - Use with -input_type separate/combined.")
55
55
  full_parser.add_argument("-input_fasta", required=False,
56
56
  help="Input FASTA file - Use with - input_type fasta.")
57
- full_parser.add_argument("-name_split", required=False,
58
- help="Substring to split filenames and extract genome names (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
57
+ full_parser.add_argument("-name_split_gff", required=False,
58
+ help="Substring to split filenames and extract genome names for gff files (e.g., '_combined.gff3') - Use with -input_type separate/combined.")
59
+ full_parser.add_argument("-name_split_fasta", required=False,
60
+ help="Substring to split filenames and extract genome names for fasta files if named differently to paired gff files (e.g., '_dna.fasta') - Use with -input_type separate/combined.")
59
61
  full_parser.add_argument("-sequence_type", choices=['AA', 'DNA'], default="AA", required=False,
60
62
  help="Clustering mode: 'DNA' or 'AA'.")
61
63
  full_parser.add_argument("-gene_ident", default="CDS", required=False,
62
64
  help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
63
- full_parser.add_argument("-c", type=float, dest="pident", default=0.90, required=False,
65
+ full_parser.add_argument("-c", type=str, dest="pident", default="0.90", required=False,
64
66
  help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
65
- full_parser.add_argument("-s", type=float, dest="len_diff", default=0.80, required=False,
67
+ full_parser.add_argument("-s", type=str, dest="len_diff", default="0.80", required=False,
66
68
  help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
69
+
67
70
  full_parser.add_argument("-fast_mode", action="store_true", required=False,
68
71
  help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")
69
72
 
@@ -91,12 +94,12 @@ def main():
91
94
  help="Gene groupings for 'Species' mode (default: '99,95,15').")
92
95
  subparser.add_argument("-genus_groups", default="1,2,3,4,5,6,7,8,9,10", required=False,
93
96
  help="Gene groupings for 'Genus' mode (default: '1-10').")
94
- subparser.add_argument("-w", default=None, dest="write_groups", required=False,
95
- help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95').")
96
- subparser.add_argument("-wi", action="store_true", dest="write_individual_groups", required=False,
97
+ subparser.add_argument("-write_groups", default=None, dest="write_groups", required=False,
98
+ help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95'). - triggers '-wig'.")
99
+ subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups", required=False,
97
100
  help="Output individual FASTA files for each group.")
98
- subparser.add_argument("-a", action="store_true", dest="align_core", required=False,
99
- help="Align and concatenate sequences for 'core' groups.")
101
+ subparser.add_argument("-align", action="store_true", dest="align_core", required=False,
102
+ help="Align and concatenate sequences for 'core' groups (those in 99-100% of genomes).")
100
103
  subparser.add_argument("-align_aa", action="store_true", required=False,
101
104
  help="Align sequences as amino acids.")
102
105
  subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out", required=False,
@@ -115,6 +118,9 @@ def main():
115
118
  # Parse Arguments
116
119
  options = parser.parse_args()
117
120
 
121
+ if options.write_groups != None and options.write_individual_groups == False:
122
+ options.write_individual_groups = True
123
+
118
124
  # Example of conditional logic based on selected mode
119
125
  print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
120
126
  if options.run_mode == "Full" and options.verbose == True:
@@ -129,13 +135,13 @@ def main():
129
135
  sys.exit("Currently reclustering only works on Partial Mode.")
130
136
  required_full_mode = [options.input_type, options.pident, options.len_diff]
131
137
  if options.input_type != 'fasta':
132
- required_full_mode.extend([options.input_dir, options.name_split])
138
+ required_full_mode.extend([options.input_dir, options.name_split_gff])
133
139
  if all(required_full_mode):
134
140
  # Proceed with the Full mode
135
141
  pass
136
142
  else:
137
143
  missing_options = [opt for opt in
138
- ['input_type', 'input_dir', 'name_split', 'clustering_format', 'pident', 'len_diff'] if
144
+ ['input_type', 'input_dir', 'name_split_gff', 'clustering_format', 'pident', 'len_diff'] if
139
145
  not options.__dict__.get(opt)]
140
146
  sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
141
147
  if options.align_core:
@@ -182,13 +188,13 @@ def main():
182
188
  elif options.sequence_type == 'AA':
183
189
  clustering_mode = 'cd-hit'
184
190
  if options.fast_mode == True:
185
- options.fast_mode = 0
191
+ options.fast_mode = 1
186
192
  if options.verbose == True:
187
193
  print("Running CD-HIT in fast mode.")
188
194
  else:
189
- options.fast_mode = 1
195
+ options.fast_mode = 0
190
196
  if options.verbose == True:
191
- print("Running CD-HIT in slow mode.")
197
+ print("Running CD-HIT in accurate mode.")
192
198
  else:
193
199
  exit("cd-hit is not installed. Please install cd-hit to proceed.")
194
200
 
@@ -234,10 +240,10 @@ def main():
234
240
  translate = False
235
241
  file_to_cluster = combined_out_file
236
242
  if options.input_type == 'separate':
237
- read_separate_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
243
+ read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate, False)
238
244
  run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
239
245
  elif options.input_type == 'combined':
240
- read_combined_files(options.input_dir, options.name_split, options.gene_ident, combined_out_file, translate)
246
+ read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate, False)
241
247
  run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
242
248
  elif options.input_type == 'fasta':
243
249
  combined_out_file = options.input_fasta
@@ -276,6 +282,8 @@ def main():
276
282
  clustering_options = clustering_options()
277
283
 
278
284
  elif options.run_mode == 'Partial':
285
+ if not os.path.exists(output_path):
286
+ os.makedirs(output_path)
279
287
  class clustering_options:
280
288
  def __init__(self):
281
289
  self.run_mode = options.run_mode
@@ -23,8 +23,8 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
23
23
  gpa_outfile.write('"\n')
24
24
  for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
25
25
  average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
26
- gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
27
- '","","","","","","","","",""')
26
+ gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
27
+ '","","","","","","","",""')
28
28
 
29
29
 
30
30
  for genus in genus_dict.keys():
@@ -34,7 +34,7 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
34
34
  if value.split('_')[0] == genus:
35
35
  tmp_list.append(value)
36
36
  if tmp_list:
37
- full_out += ',"'+''.join(tmp_list)+'"'
37
+ full_out += ',"'+' '.join(tmp_list)+'"'
38
38
  else:
39
39
  full_out = ',""'
40
40
  gpa_outfile.write(full_out)
@@ -21,7 +21,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
21
21
  gpa_outfile.write('"\n')
22
22
  for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
23
23
  average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
24
- gpa_outfile.write('"group_'+str(cluster)+'","","",'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
24
+ gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
25
25
  '","","","","","","","","",""')
26
26
 
27
27
 
@@ -32,7 +32,7 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
32
32
  if value.split('|')[0] == genome:
33
33
  tmp_list.append(value.split('|')[1])
34
34
  if tmp_list:
35
- full_out += ',"'+'\t'.join(tmp_list)+'"'
35
+ full_out += ',"'+' '.join(tmp_list)+'"'
36
36
  else:
37
37
  full_out = ',""'
38
38
  gpa_outfile.write(full_out)
@@ -120,12 +120,14 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
120
120
 
121
121
  #@profile
122
122
def calc_only_Second_only_core(cluster, Second_num, groups, cores):  # only count the true storf onlies
    """Append *cluster* to every 'only_Second_core_<group>' bucket in *cores*
    whose (second, first) range contains Second_num (inclusive on both ends).

    The previous version scanned indices of list(groups.values()), rebuilt
    list(groups) on every match (O(n^2)), and wrapped the loop in an
    `except UnboundLocalError` that could never fire: `res` was always
    assigned before use inside the loop body. Iterating the dict directly
    preserves the same matches and insertion order with none of that.
    """
    for family_group, (sec, fir) in groups.items():
        if sec <= Second_num <= fir:
            cores['only_Second_core_' + family_group].append(cluster)
129
131
 
130
132
 
131
133
  #@profile