PyamilySeq-1.1.0-py3-none-any.whl → PyamilySeq-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyamilySeq/Cluster_Compare.py ADDED
@@ -0,0 +1,108 @@
+ import argparse
+ from collections import defaultdict
+
+ def read_cd_hit_output(clstr_file):
+     """
+     Reads a CD-HIT .clstr file and extracts sequence clusters.
+     Returns a dictionary where keys are sequence headers and values are cluster IDs.
+     """
+     seq_to_cluster = {}  # Maps sequence header -> cluster ID
+     cluster_id = 0  # Generic ID for clusters (since CD-HIT names don't matter)
+
+     with open(clstr_file, 'r') as f:
+         for line in f:
+             line = line.strip()
+             if line.startswith(">Cluster"):
+                 cluster_id += 1  # Increment cluster ID
+             elif line:
+                 parts = line.split('\t')
+                 if len(parts) > 1:
+                     seq_header = parts[1].split('>')[1].split('...')[0]  # Extract sequence header
+                     seq_to_cluster[seq_header] = cluster_id
+
+     return seq_to_cluster
+
+ def compare_cd_hit_clusters(file1, file2, output_file):
+     """
+     Compares two CD-HIT .clstr files to check if clusters are the same.
+     Writes the results to a TSV file.
+     """
+     # Read both clustering files
+     clusters1 = read_cd_hit_output(file1)
+     clusters2 = read_cd_hit_output(file2)
+
+     # Reverse mappings: cluster ID -> list of sequences
+     grouped_clusters1 = defaultdict(set)
+     grouped_clusters2 = defaultdict(set)
+
+     for seq, cluster_id in clusters1.items():
+         grouped_clusters1[cluster_id].add(seq)
+     for seq, cluster_id in clusters2.items():
+         grouped_clusters2[cluster_id].add(seq)
+
+     # Initialize metrics counters
+     cluster_name_changes = 0
+     sequence_shifts = 0
+     only_in_file1 = defaultdict(list)
+     only_in_file2 = defaultdict(list)
+     cluster_mismatches = defaultdict(list)
+
+     # Prepare data for the TSV output
+     tsv_data = []
+
+     # Track changes
+     for seq, cluster_id in clusters1.items():
+         if seq not in clusters2:
+             only_in_file1[cluster_id].append(seq)
+             tsv_data.append([seq, cluster_id, "NA", "Only in file1"])
+         elif clusters2[seq] != cluster_id:
+             # Sequence shifts: sequence in different clusters between files
+             sequence_shifts += 1
+             cluster_mismatches[seq].append((cluster_id, clusters2[seq]))
+             tsv_data.append([seq, cluster_id, clusters2[seq], "Mismatch"])
+
+     for seq, cluster_id in clusters2.items():
+         if seq not in clusters1:
+             only_in_file2[cluster_id].append(seq)
+             tsv_data.append([seq, "NA", cluster_id, "Only in file2"])
+         elif clusters1[seq] != cluster_id:
+             # Sequence shifts: sequence in different clusters between files
+             sequence_shifts += 1
+             cluster_mismatches[seq].append((clusters1[seq], cluster_id))
+             tsv_data.append([seq, clusters1[seq], cluster_id, "Mismatch"])
+
+     # Track cluster name changes (same sequences in different clusters)
+     for cluster_id1, seqs1 in grouped_clusters1.items():
+         for cluster_id2, seqs2 in grouped_clusters2.items():
+             if seqs1 == seqs2 and cluster_id1 != cluster_id2:
+                 cluster_name_changes += 1
+                 for seq in seqs1:
+                     tsv_data.append([seq, cluster_id1, cluster_id2, "Cluster name change"])
+
+     # Print metrics
+     print("🔢 Clustering Comparison Metrics:")
+     print(f"Cluster name changes: {cluster_name_changes}")
+     print(f"Sequence shifts (sequences assigned to different clusters): {sequence_shifts}")
+     print(f"Sequences only in the first file: {len(only_in_file1)}")
+     print(f"Sequences only in the second file: {len(only_in_file2)}")
+     print()
+
+     # Write the results to a TSV file
+     with open(output_file, 'w') as out_file:
+         out_file.write("Sequence\tCluster ID (File 1)\tCluster ID (File 2)\tChange Type\n")
+         for row in tsv_data:
+             out_file.write("\t".join(map(str, row)) + "\n")
+
+     print(f"✅ Results have been written to {output_file}")
+
+ def main():
+     parser = argparse.ArgumentParser(description="Compare two CD-HIT .clstr files to check for clustering consistency.")
+     parser.add_argument("-file1", required=True, help="First CD-HIT .clstr file")
+     parser.add_argument("-file2", required=True, help="Second CD-HIT .clstr file")
+     parser.add_argument("-output", required=True, help="Output file (TSV format)")
+     args = parser.parse_args()
+
+     compare_cd_hit_clusters(args.file1, args.file2, args.output)
+
+ if __name__ == "__main__":
+     main()
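Note: `read_cd_hit_output` above assumes the standard CD-HIT `.clstr` layout, in which each `>Cluster N` header is followed by tab-separated member lines. A minimal, self-contained sketch of that layout and of the same parsing rules (the genome and gene names are invented):

```python
# Sample .clstr content in the shape the parser above expects (names invented).
sample = (
    ">Cluster 0\n"
    "0\t100aa, >genomeA|gene1... *\n"
    "1\t98aa, >genomeB|gene1... at 95.00%\n"
    ">Cluster 1\n"
    "0\t120aa, >genomeA|gene2... *\n"
)

seq_to_cluster, cluster_id = {}, 0
for line in sample.splitlines():
    line = line.strip()
    if line.startswith(">Cluster"):
        cluster_id += 1  # incremented on every header, so the first cluster gets ID 1
    elif line:
        parts = line.split('\t')
        if len(parts) > 1:
            seq_to_cluster[parts[1].split('>')[1].split('...')[0]] = cluster_id

print(seq_to_cluster)
# {'genomeA|gene1': 1, 'genomeB|gene1': 1, 'genomeA|gene2': 2}
```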
PyamilySeq/Cluster_Summary.py CHANGED
@@ -1,33 +1,32 @@
  import argparse
- from collections import OrderedDict
- from collections import defaultdict
+ from collections import OrderedDict, defaultdict

  try:
      from .constants import *
      from .utils import *
- except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
+ except (ModuleNotFoundError, ImportError, NameError, TypeError):
      from constants import *
      from utils import *


  def categorise_percentage(percent):
      """Categorise the percentage of genomes with multicopy genes."""
-     if 20 <= percent < 40:
-         return "20-40%"
-     elif 40 <= percent < 60:
-         return "40-60%"
-     elif 60 <= percent < 80:
-         return "60-80%"
-     elif 80 <= percent < 95:
-         return "80-95%"
-     elif 95 <= percent < 99:
-         return "95-99%"
-     elif 99 <= percent <= 100:
-         return "99-100%"
+     categories = {
+         (20, 40): "20-40%",
+         (40, 60): "40-60%",
+         (60, 80): "60-80%",
+         (80, 95): "80-95%",
+         (95, 99): "95-99%",
+         (99, 100): "99-100%"
+     }
+     for (low, high), label in categories.items():
+         if low <= percent < high:
+             return label
      return None

- # Read cd-hit .clstr file and extract information
+
  def read_cd_hit_output(clustering_output):
+     """Parse CD-HIT .cluster file and extract clustering information."""
      clusters = OrderedDict()

      with open(clustering_output, 'r') as f:
@@ -42,10 +41,8 @@ def read_cd_hit_output(clustering_output):
              parts = line.split('\t')
              if len(parts) > 1:
                  clustered_info = parts[1]
-                 length = clustered_info.split(',')[0]
-                 length = int(''.join(c for c in length if c.isdigit()))
-                 clustered_header = clustered_info.split('>')[1].split('...')[0]
-                 clustered_header = '>' + clustered_header
+                 length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
+                 clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]

                  if 'at ' in clustered_info and '%' in clustered_info.split('at ')[-1]:
                      percent_identity = extract_identity(clustered_info)
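The condensed one-liners above pull the member length and header out of entries such as `0\t100aa, >genomeA|gene1... *`. A quick standalone check (the entry text is invented):

```python
clustered_info = "100aa, >genomeA|gene1... *"  # parts[1] of a made-up .clstr member line

length = int(''.join(c for c in clustered_info.split(',')[0] if c.isdigit()))
clustered_header = '>' + clustered_info.split('>')[1].split('...')[0]

print(length, clustered_header)  # 100 >genomeA|gene1
```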
@@ -63,12 +60,14 @@ def read_cd_hit_output(clustering_output):
      return clusters


- # Summarise the information for each cluster
- def summarise_clusters(options,clusters, output):
-     multicopy_groups = defaultdict(int)  # Counter for groups with multicopy genes
+ def summarise_clusters(options, clusters, output):
+     """Generate a detailed cluster summary report."""
+     multicopy_groups = defaultdict(int)  # Counter for clusters with multicopy genes

      with open(output, 'w') as out_f:
-         out_f.write("Cluster_ID\tNum_Sequences\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\n")
+         out_f.write(
+             "Cluster_ID\tNum_Sequences\tNum_Genomes\tAvg_Length\tLength_Range\tAvg_Identity\tIdentity_Range\tGenomes_With_Multiple_Genes\tMulticopy_Percentage\n"
+         )

          for cluster_id, seqs in clusters.items():
              num_seqs = len(seqs)
@@ -81,82 +80,78 @@ def summarise_clusters(options,clusters, output):
              avg_identity = sum(identities) / num_seqs if num_seqs > 0 else 0
              identity_range = f"{min(identities):.2f}-{max(identities):.2f}" if num_seqs > 0 else "N/A"

-             out_f.write(
-                 f"{cluster_id}\t{num_seqs}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\n")
-
-             # Count genomes with more than one gene
+             # Count genomes in cluster
              genome_to_gene_count = defaultdict(int)
              for seq in seqs:
-                 genome = seq['header'].split('|')[0].replace('>','')
+                 genome = seq['header'].split('|')[0].replace('>', '')
                  genome_to_gene_count[genome] += 1

+             num_genomes = len(genome_to_gene_count)
              num_genomes_with_multiple_genes = sum(1 for count in genome_to_gene_count.values() if count > 1)
+             multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100 if options.genome_num > 0 else 0

-             # Calculate the percentage of genomes with multicopy genes
-
-             multicopy_percentage = (num_genomes_with_multiple_genes / options.genome_num) * 100
+             # Categorize multicopy percentage
              category = categorise_percentage(multicopy_percentage)
              if category:
                  multicopy_groups[category] += 1

-     # Define the order of categories for printout
-     category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
+             # Write detailed output for each cluster
+             out_f.write(
+                 f"{cluster_id}\t{num_seqs}\t{num_genomes}\t{avg_length:.2f}\t{length_range}\t{avg_identity:.2f}\t{identity_range}\t"
+                 f"{num_genomes_with_multiple_genes}\t{multicopy_percentage:.2f}\n"
+             )

-     # Print the number of clusters with multicopy genes in each percentage range, in the correct order
+     # Define order for multicopy statistics output
+     category_order = ["20-40%", "40-60%", "60-80%", "80-95%", "95-99%", "99-100%"]
      for category in category_order:
-         print(f"Number of clusters with multicopy genes in {category} range: {multicopy_groups[category]}")
+         print(f"Clusters with multicopy genes in {category} range: {multicopy_groups[category]}")


- # Main function to parse arguments and run the analysis
  def main():
-     parser = argparse.ArgumentParser(description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')
-     ### Required Arguments
-     required = parser.add_argument_group('Required Parameters')
-     required.add_argument('-input_clstr', action="store", dest="input_clstr",
-                           help='Input CD-HIT .clstr file',
-                           required=True)
-     required.add_argument('-output', action="store", dest="output",
-                           help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user",
-                           required=True)
-     required.add_argument('-genome_num', action='store', dest='genome_num', type=int,
-                           help='The total number of genomes must be provide',
-                           required=True)
-     #required.add_argument("-clustering_format", action="store", dest="clustering_format", choices=['CD-HIT','TSV','CSV'],
-     #                      help="Clustering format to use: CD-HIT or TSV (MMseqs2, BLAST, DIAMOND) / CSV edge-list file (Node1\tNode2).",
-     #                      required=True)
+     """Main function to parse arguments and process clustering files."""
+     parser = argparse.ArgumentParser(
+         description='PyamilySeq ' + PyamilySeq_Version + ': Cluster-Summary - A tool to summarise CD-HIT clustering files.')

+     # Required Arguments
+     required = parser.add_argument_group('Required Parameters')
+     required.add_argument('-input_cluster', action="store", dest="input_cluster", required=True,
+                           help='Input CD-HIT .cluster file')
+     required.add_argument('-output', action="store", dest="output", required=True,
+                           help="Output TSV file to store cluster summaries - Will add '.tsv' if not provided by user")
+     required.add_argument('-genome_num', action='store', dest='genome_num', type=int, required=True,
+                           help='Total number of genomes in dataset')
+
+     # Optional Arguments
      optional = parser.add_argument_group('Optional Arguments')
      optional.add_argument('-output_dir', action="store", dest="output_dir",
-                           help='Default: Same as input file',
-                           required=False)
+                           help='Default: Same as input file', required=False)

      misc = parser.add_argument_group("Misc Parameters")
      misc.add_argument("-verbose", action="store_true", dest="verbose",
-                       help="Print verbose output.",
-                       required=False)
+                       help="Print verbose output.", required=False)
      misc.add_argument("-v", "--version", action="version",
                        version=f"PyamilySeq: Group-Summary version {PyamilySeq_Version} - Exiting",
                        help="Print out version number and exit")

-
      options = parser.parse_args()
-     print("Running PyamilySeq " + PyamilySeq_Version+ ": Group-Summary ")
+     print("Running PyamilySeq " + PyamilySeq_Version + ": Group-Summary ")

-     ### File handling
-     options.input_clstr = fix_path(options.input_clstr)
+     # File handling
+     options.input_cluster = fix_path(options.input_cluster)
      if options.output_dir is None:
-         options.output_dir = os.path.dirname(os.path.abspath(options.input_clstr))
+         options.output_dir = os.path.dirname(os.path.abspath(options.input_cluster))
      output_path = os.path.abspath(options.output_dir)
      if not os.path.exists(output_path):
          os.makedirs(output_path)
+
      output_name = options.output
      if not output_name.endswith('.tsv'):
          output_name += '.tsv'
      output_file_path = os.path.join(output_path, output_name)
-     ###

-     clusters = read_cd_hit_output(options.input_clstr)
-     summarise_clusters(options,clusters, output_file_path)
+     # Process clusters and generate summary
+     clusters = read_cd_hit_output(options.input_cluster)
+     summarise_clusters(options, clusters, output_file_path)


  if __name__ == "__main__":
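One behavioural note on the `categorise_percentage` rewrite above: the loop applies `low <= percent < high` to every interval, including `(99, 100)`, whereas the old `elif` chain ended with the inclusive test `99 <= percent <= 100`. A standalone check of the boundary:

```python
def categorise_percentage(percent):
    categories = {
        (20, 40): "20-40%", (40, 60): "40-60%", (60, 80): "60-80%",
        (80, 95): "80-95%", (95, 99): "95-99%", (99, 100): "99-100%",
    }
    for (low, high), label in categories.items():
        if low <= percent < high:
            return label
    return None

print(categorise_percentage(99.5))  # '99-100%'
print(categorise_percentage(100))   # None; the old chain returned '99-100%' here
print(categorise_percentage(10))    # None in both versions (below the first interval)
```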
PyamilySeq/PyamilySeq.py CHANGED
@@ -1,5 +1,5 @@
  import argparse
-
+ #from config import config_params

  try:
      from .PyamilySeq_Species import cluster as species_cluster
@@ -20,8 +20,8 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
          clustering_mode,
          '-i', input_file,
          '-o', clustering_output,
-         '-c', str(options.pident),
-         '-s', str(options.len_diff),
+         '-c', f"{float(options.pident):.2f}",
+         '-s', f"{float(options.len_diff):.2f}",
          '-T', str(options.threads),
          '-M', str(options.mem),
          '-d', "0",
@@ -62,11 +62,12 @@ def main():
                               help="Clustering mode: 'DNA' or 'AA'.")
      full_parser.add_argument("-gene_ident", default="CDS", required=False,
                               help="Gene identifiers to extract sequences (e.g., 'CDS, tRNA').")
-     full_parser.add_argument("-c", type=float, dest="pident", default=0.90, required=False,
+     full_parser.add_argument("-c", type=str, dest="pident", default="0.90", required=False,
                               help="Sequence identity threshold for clustering (default: 0.90) - CD-HIT parameter '-c'.")
-     full_parser.add_argument("-s", type=float, dest="len_diff", default=0.80, required=False,
+     full_parser.add_argument("-s", type=str, dest="len_diff", default="0.80", required=False,
                               help="Length difference threshold for clustering (default: 0.80) - CD-HIT parameter '-s'.")
-     full_parser.add_argument("-fast_mode", action="store_true", required=False,
+
+     full_parser.add_argument("-fast_mode", action="store_true",
                               help="Enable fast mode for CD-HIT (not recommended) - CD-HIT parameter '-g'.")


@@ -94,14 +95,14 @@ def main():
      subparser.add_argument("-genus_groups", default="1,2,3,4,5,6,7,8,9,10", required=False,
                             help="Gene groupings for 'Genus' mode (default: '1-10').")
      subparser.add_argument("-write_groups", default=None, dest="write_groups", required=False,
-                            help="Output gene groups as a single FASTA file (specify levels: e.g., '-w 99,95'). - triggers '-wig'.")
-     subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups", required=False,
+                            help="Output gene groups as a single FASTA file (e.g., '99,95'). Triggers writing individual groups.")
+     subparser.add_argument("-write_individual_groups", action="store_true", dest="write_individual_groups",
                             help="Output individual FASTA files for each group.")
-     subparser.add_argument("-align", action="store_true", dest="align_core", required=False,
-                            help="Align and concatenate sequences for 'core' groups specified with '-w'.")
-     subparser.add_argument("-align_aa", action="store_true", required=False,
+     subparser.add_argument("-align", action="store_true", dest="align_core",
+                            help="Align and concatenate sequences for 'core' groups (those in 99-100%% of genomes).")
+     subparser.add_argument("-align_aa", action="store_true",
                             help="Align sequences as amino acids.")
-     subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out", required=False,
+     subparser.add_argument("-no_gpa", action="store_false", dest="gene_presence_absence_out",
                             help="Skip creation of gene_presence_absence.csv.")
      subparser.add_argument("-M", type=int, default=4000, dest="mem", required=False,
                             help="Memory allocation for clustering (MB) - CD-HIT parameter '-M'.")
@@ -109,13 +110,15 @@ def main():
                             help="Number of threads for clustering/alignment - CD-HIT parameter '-T' | MAFFT parameter '--thread'.")

      # Miscellaneous Arguments
-     subparser.add_argument("-verbose", action="store_true", required=False,
+     subparser.add_argument("-verbose", action="store_true",
                             help="Print verbose output.")
      subparser.add_argument("-v", "--version", action="version",
-                            version=f"PyamilySeq {PyamilySeq_Version}: Exiting.", help="Print version number and exit.")
+                            version=f"PyamilySeq {PyamilySeq_Version}: Exiting.")

      # Parse Arguments
      options = parser.parse_args()
+     ## Configuration
+

      if options.write_groups != None and options.write_individual_groups == False:
          options.write_individual_groups = True
@@ -146,7 +149,7 @@ def main():
      if options.align_core:
          options.write_individual_groups = True
          if options.write_groups == None:
-             sys.exit('Must provide "-w" to output gene groups before alignment "-a" can be done.')
+             sys.exit('Must provide "-write_groups" to output gene groups before alignment "-align" can be done.')
      elif options.run_mode == 'Partial':
          required_partial_mode = [options.cluster_file, options.original_fasta]
          if all(required_partial_mode):
@@ -187,13 +190,13 @@ def main():
      elif options.sequence_type == 'AA':
          clustering_mode = 'cd-hit'
          if options.fast_mode == True:
-             options.fast_mode = 0
+             options.fast_mode = 1
              if options.verbose == True:
                  print("Running CD-HIT in fast mode.")
          else:
-             options.fast_mode = 1
+             options.fast_mode = 0
              if options.verbose == True:
-                 print("Running CD-HIT in slow mode.")
+                 print("Running CD-HIT in accurate mode.")
      else:
          exit("cd-hit is not installed. Please install cd-hit to proceed.")

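For reference, CD-HIT's `-g` parameter (which the `-fast_mode` help text ties to this flag) selects between its two assignment strategies: `-g 0`, the default, places each sequence into the first cluster that meets the threshold, while `-g 1` searches for the most similar cluster and is slower. A hedged sketch of the resulting invocation, assuming the value is passed straight through to `-g` as the surrounding code suggests (file names are invented, and `cd-hit` must be on PATH):

```python
import subprocess

cmd = [
    "cd-hit",
    "-i", "combined_aa.fasta",  # hypothetical input produced upstream
    "-o", "clustered",
    "-c", f"{0.90:.2f}",        # identity threshold, formatted as in run_cd_hit
    "-s", f"{0.80:.2f}",        # length-difference threshold
    "-g", "0",                  # 0 = fast first-fit assignment, 1 = most-similar (slower)
    "-d", "0",                  # keep full sequence names in the .clstr output
]
subprocess.run(cmd, check=True)
```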
@@ -239,10 +242,10 @@ def main():
          translate = False
          file_to_cluster = combined_out_file
          if options.input_type == 'separate':
-             read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate)
+             read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, translate, False)
              run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
          elif options.input_type == 'combined':
-             read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate)
+             read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, translate, False)
              run_cd_hit(options, file_to_cluster, clustering_output, clustering_mode)
          elif options.input_type == 'fasta':
              combined_out_file = options.input_fasta
@@ -281,6 +284,8 @@ def main():
          clustering_options = clustering_options()

      elif options.run_mode == 'Partial':
+         if not os.path.exists(output_path):
+             os.makedirs(output_path)
          class clustering_options:
              def __init__(self):
                  self.run_mode = options.run_mode
PyamilySeq/PyamilySeq_Genus.py CHANGED
@@ -17,7 +17,8 @@ def gene_presence_absence_output(options, genus_dict, pangenome_clusters_First_s
      #in_name = options.clusters.split('.')[0].split('/')[-1]
      gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
      gpa_outfile = open(gpa_outfile, 'w')
-     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
+     genus_dict = OrderedDict(sorted(genus_dict.items()))
+     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
                        '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
      gpa_outfile.write('","'.join(genus_dict.keys()))
      gpa_outfile.write('"\n')
PyamilySeq/PyamilySeq_Species.py CHANGED
@@ -15,14 +15,15 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_
      #in_name = options.clusters.split('.')[0].split('/')[-1]
      gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
      gpa_outfile = open(gpa_outfile, 'w')
-     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment","'
+     genome_dict = OrderedDict(sorted(genome_dict.items()))
+     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
                        '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
      gpa_outfile.write('","'.join(genome_dict.keys()))
      gpa_outfile.write('"\n')
      for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
          average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
          gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
-                           '","","","","","","","","",""')
+                           '","","","","","","","",""')


      for genome in genome_dict.keys():
@@ -120,12 +121,14 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):

  #@profile
  def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
-     groups_as_list = list(groups.values())
-     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
-         res = idx
-     family_group = list(groups)[res]
-     cores['only_Second_core_' + family_group].append(cluster)
-
+     try:
+         groups_as_list = list(groups.values())
+         for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
+             res = idx
+         family_group = list(groups)[res]
+         cores['only_Second_core_' + family_group].append(cluster)
+     except UnboundLocalError:
+         sys.exit("Error in calc_only_Second_only_core")


  #@profile
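The `try/except UnboundLocalError` added above covers the case where the generator expression matches no interval: `res` is then never assigned, and `list(groups)[res]` raises when it is read. A stripped-down illustration of the failure mode (the group bounds are invented):

```python
groups = {"hard": (10, 10), "soft": (9, 9)}  # label -> (low, high) inclusive bounds

def find_family_group(groups, second_num):
    groups_as_list = list(groups.values())
    for idx in (i for i, (sec, fir) in enumerate(groups_as_list) if sec <= second_num <= fir):
        res = idx
    return list(groups)[res]  # UnboundLocalError if no interval matched

print(find_family_group(groups, 10))  # 'hard'
# find_family_group(groups, 5)  ->  UnboundLocalError: 'res' referenced before assignment
```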
PyamilySeq/Seq_Combiner.py CHANGED
@@ -65,17 +65,17 @@ def main():
      if not os.path.exists(output_path):
          os.makedirs(output_path)

-     output_file = options.output_file + '.fasta'
-     if os.path.exists(os.path.join(output_path, output_file)):
-         print(f"Output file {output_file} already exists in the output directory. Please delete or rename the file and try again.")
+     #output_file = options.output_file + '.fasta'
+     if os.path.exists(os.path.join(output_path, options.output_file)):
+         print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
          exit(1)

-     combined_out_file = os.path.join(output_path, output_file )
+     combined_out_file = os.path.join(output_path, options.output_file )

      if options.input_type == 'separate':
-         read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate)
+         read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
      elif options.input_type == 'combined':
-         read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate)
+         read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
      elif options.input_type == 'fasta':
          read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate)

PyamilySeq/clusterings.py CHANGED
@@ -279,8 +279,6 @@ def combined_clustering_CDHIT(options, taxa_dict, splitter):
      first = True
      for line in Second_in:
          if line.startswith('>'):
-             if '>Cluster 1997' in line:
-                 print()
              if first == False:
                  cluster_size = len(Combined_clusters[cluster_id])
                  Combined_reps.update({rep: cluster_size})
PyamilySeq/config.py ADDED
File without changes
PyamilySeq/constants.py CHANGED
@@ -1,2 +1,2 @@
- PyamilySeq_Version = 'v1.1.0'
+ PyamilySeq_Version = 'v1.1.2'

PyamilySeq/utils.py CHANGED
@@ -7,6 +7,7 @@ from tempfile import NamedTemporaryFile
  import sys
  import re
  import math
+ #from config import config_params

  ####
  # Placeholder for the distance function
@@ -18,6 +19,7 @@ try:
      def levenshtein_distance_calc(seq1, seq2):
          return LV.distance(seq1, seq2)
  except (ModuleNotFoundError, ImportError):
+     #if config_params.verbose == True: - Not implemented yet
      print("Levenshtein package not installed - Will fallback to slower Python implementation.")
      # Fallback implementation
      def levenshtein_distance_calc(seq1, seq2):
@@ -228,9 +230,15 @@ def run_mafft_on_sequences(options, sequences, output_file):



- def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate):
-     paired_files_found = None
-     with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
+     if run_as_combiner == True:
+         combined_out_file_aa = None
+     else:
+         combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+
+     with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
+         paired_files_found = None
+         #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
          gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
          if not gff_files:
              sys.exit("Error: No GFF files found.")
@@ -299,23 +307,40 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
                  full_sequence = fasta_dict[contig][1]
                  seq = full_sequence[corrected_start:corrected_stop]

-                 if translate == True:
-                     seq_aa = translate_frame(seq)
-                     wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
-                     combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
-                 wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
-                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                 if run_as_combiner == True:
+                     if translate == True:
+                         seq_aa = translate_frame(seq)
+                         wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                     else:
+                         wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                 else:
+                     if translate == True:
+                         seq_aa = translate_frame(seq)
+                         wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                         combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                     wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                     combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

      if not paired_files_found:
          sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
      if translate == False or translate == None:
          #Clean up unused file
-         if combined_out_file.name != combined_out_file_aa.name:
-             os.remove(combined_out_file_aa.name)
+         try: # Catches is combined_out_file_aa is None
+             if combined_out_file.name != combined_out_file_aa.name:
+                 os.remove(combined_out_file_aa.name)
+         except AttributeError:
+             pass


- def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate):
-     with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+ def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
+     if run_as_combiner == True:
+         combined_out_file_aa = None
+     else:
+         combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+     #with open(combined_out, 'w') as combined_out_file, open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w'):
+     with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
          gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
          if not gff_files:
              sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
@@ -355,7 +380,7 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla

          for contig, fasta in fasta_dict.items():
              reverse_sequence = reverse_complement(fasta[0])
-             fasta_dict[contig][1]=reverse_sequence
+             fasta_dict[contig][1] = reverse_sequence

          if fasta_dict and gff_features:
              for contig, start, end, strand, feature, seq_id in gff_features:
@@ -369,22 +394,38 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
                  full_sequence = fasta_dict[contig][1]
                  seq = full_sequence[corrected_start:corrected_stop]

-                 if translate == True:
-                     seq_aa = translate_frame(seq)
-                     wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
-                     combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
-                 wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
-                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                 if run_as_combiner == True:
+                     if translate == True:
+                         seq_aa = translate_frame(seq)
+                         wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                     else:
+                         wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                         combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+                 else:
+                     if translate == True:
+                         seq_aa = translate_frame(seq)
+                         wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                         combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                     wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                     combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

      if translate == False or translate == None:
          #Clean up unused file
-         if combined_out_file.name != combined_out_file_aa.name:
-             os.remove(combined_out_file_aa.name)
+         try: # Catches is combined_out_file_aa is None
+             if combined_out_file.name != combined_out_file_aa.name:
+                 os.remove(combined_out_file_aa.name)
+         except AttributeError:
+             pass



- def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
-     with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
+ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
+     if run_as_combiner == True:
+         combined_out_file_aa = None
+     else:
+         combined_out_file_aa = combined_out.replace('_dna.fasta','_aa.fasta')
+     with open(combined_out, 'w') as combined_out_file, (open(combined_out_file_aa, 'w') if combined_out_file_aa else open(os.devnull, 'w')) as combined_out_file_aa:
          fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
          if not fasta_files:
              sys.exit("Error: No GFF files found.")
@@ -400,17 +441,30 @@ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate):
              else:
                  fasta_dict[current_seq] +=line.strip()
          for seq_id, seq in fasta_dict.items():
-             if translate == True:
-                 seq_aa = translate_frame(seq)
-                 wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
-                 combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
-             wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
-             combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+             if run_as_combiner == True:
+                 if translate == True:
+                     seq_aa = translate_frame(seq)
+                     wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                     combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                 else:
+                     wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                     combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")
+             else:
+                 if translate == True:
+                     seq_aa = translate_frame(seq)
+                     wrapped_sequence_aa = '\n'.join([seq_aa[i:i + 60] for i in range(0, len(seq_aa), 60)])
+                     combined_out_file_aa.write(f">{genome_name}|{seq_id}\n{wrapped_sequence_aa}\n")
+                 wrapped_sequence = '\n'.join([seq[i:i + 60] for i in range(0, len(seq), 60)])
+                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

      if translate == False or translate == None:
          #Clean up unused file
-         if combined_out_file.name != combined_out_file_aa.name:
-             os.remove(combined_out_file_aa.name)
+         try: # Catches is combined_out_file_aa is None
+             if combined_out_file.name != combined_out_file_aa.name:
+                 os.remove(combined_out_file_aa.name)
+         except AttributeError:
+             pass
+

  def write_groups_func(options, output_dir, key_order, cores, sequences,
                        pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
@@ -533,7 +587,8 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
  def perform_alignment(gene_path,group_directory, gene_file, options, concatenated_sequences, subgrouped):
      # Read sequences from the gene family file
      sequences = read_fasta(gene_path)
-
+     if len(sequences) == 1: # We can't align a single sequence
+         return concatenated_sequences
      # Select the longest sequence for each genome
      longest_sequences = select_longest_gene(sequences, subgrouped)

@@ -574,20 +629,18 @@ def process_gene_groups(options, group_directory, sub_group_directory, paralog_g
      # Iterate over each gene family file
      for gene_file in os.listdir(group_directory):
          if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
-             #print(gene_file)
              current_group = int(gene_file.split('_')[3].split('.')[0])
              gene_path = os.path.join(group_directory, gene_file)
-
-             # Check for matching group in paralog_groups
-             if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
-                 for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
-                     if size >= threshold_size:
-                         gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
-                         concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
-
-             else:
-                 concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
-
+             # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
+             if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
+                 # Check for matching group in paralog_groups
+                 if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
+                     for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
+                         if size >= threshold_size:
+                             gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
+                             concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
+                 else:
+                     concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)

      # Write the concatenated sequences to the output file
      with open(output_file, 'w') as out:
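The new `'First_core_99' in gene_file or 'First_core_100' in gene_file` guard above filters purely on the group file name, so only 99-100% core groups reach the aligner. A small illustration of which files pass, using names in the `First_core_<level>_<group>` pattern implied by the `split('_')[3]` indexing (the specific names are invented):

```python
files = [
    "First_core_99_12.fasta",
    "First_core_100_3.fasta",
    "First_core_95_7.fasta",           # skipped: not a 99/100 core group
    "combined_group_sequences.fasta",  # already excluded by the startswith() check
]
aligned = [f for f in files if 'First_core_99' in f or 'First_core_100' in f]
print(aligned)  # ['First_core_99_12.fasta', 'First_core_100_3.fasta']
```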
{pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: PyamilySeq
- Version: 1.1.0
+ Version: 1.1.2
  Summary: PyamilySeq - A a tool to investigate sequence-based gene groups identified by clustering methods such as CD-HIT, DIAMOND, BLAST or MMseqs2.
  Home-page: https://github.com/NickJD/PyamilySeq
  Author: Nicholas Dimonaco
@@ -13,6 +13,7 @@ Requires-Python: >=3.6
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: levenshtein
+ Dynamic: license-file

  # PyamilySeq
  **PyamilySeq** is a Python tool for clustering gene sequences into groups based on sequence similarity identified by tools such as CD-HIT, BLAST, DIAMOND or MMseqs2.
@@ -45,7 +46,7 @@ To update to the newest version add '-U' to end of the pip install command.
  ```commandline
  usage: PyamilySeq.py [-h] {Full,Partial} ...

- PyamilySeq v1.1.0: A tool for gene clustering and analysis.
+ PyamilySeq v1.1.2: A tool for gene clustering and analysis.

  positional arguments:
    {Full,Partial}  Choose a mode: 'Full' or 'Partial'.
@@ -75,7 +76,7 @@ Escherichia_coli_110957|ENSB_TIZS9kbTvShDvyX Escherichia_coli_110957|ENSB_TIZS9k
  ```
  ### Example output:
  ```
- Running PyamilySeq v1.1.0
+ Running PyamilySeq v1.1.2
  Calculating Groups
  Number of Genomes: 10
  Gene Groups
@@ -220,7 +221,7 @@ Seq-Combiner -input_dir .../test_data/genomes -name_split_gff .gff3 -output_dir
  ```
  usage: Seq_Combiner.py [-h] -input_dir INPUT_DIR -input_type {separate,combined,fasta} [-name_split_gff NAME_SPLIT_GFF] [-name_split_fasta NAME_SPLIT_FASTA] -output_dir OUTPUT_DIR -output_name OUTPUT_FILE [-gene_ident GENE_IDENT] [-translate] [-v]

- PyamilySeq v1.1.0: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.
+ PyamilySeq v1.1.2: Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.

  options:
    -h, --help            show this help message and exit
@@ -263,7 +264,7 @@ usage: Group_Splitter.py [-h] -input_fasta INPUT_FASTA -sequence_type {AA,DNA}
                           [-M CLUSTERING_MEMORY] [-no_delete_temp_files]
                           [-verbose] [-v]

- PyamilySeq v1.1.0: Group-Splitter - A tool to split multi-copy gene groups
+ PyamilySeq v1.1.2: Group-Splitter - A tool to split multi-copy gene groups
  identified by PyamilySeq.

  options:
@@ -316,7 +317,7 @@ Cluster-Summary -genome_num 10 -input_clstr .../test_data/species/E-coli/E-coli_
  usage: Cluster_Summary.py [-h] -input_clstr INPUT_CLSTR -output OUTPUT -genome_num GENOME_NUM
                            [-output_dir OUTPUT_DIR] [-verbose] [-v]

- PyamilySeq v1.1.0: Cluster-Summary - A tool to summarise CD-HIT clustering files.
+ PyamilySeq v1.1.2: Cluster-Summary - A tool to summarise CD-HIT clustering files.

  options:
    -h, --help            show this help message and exit
pyamilyseq-1.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
+ PyamilySeq/Cluster_Compare.py,sha256=2jRXBYN8T9TUDLV9bj3SWFQ2pBUH3BAKW1FYrDYSQBw,4421
+ PyamilySeq/Cluster_Summary.py,sha256=efXMfGvATERCTxwaqbauhZwt_5Hrf9KpGKY3EgsHVDk,6720
+ PyamilySeq/Group_Extractor.py,sha256=oe2VmOVxdvTmAcy8NKwD1F27IdN2utAfczxsyxg96yc,2898
+ PyamilySeq/Group_Sizes.py,sha256=3snkAN19o3Y4IY6IqSim1qy415FfQe1Wb8vzWTKF0Wo,3028
+ PyamilySeq/Group_Splitter.py,sha256=OcMj9GnAyybs_DaNKRyvfL_nl2dB2gUI4BD_EQrBbWo,25653
+ PyamilySeq/PyamilySeq.py,sha256=tdmIDB2ZYCRfMFQSuWrN0Psr5ggSaoUcT2wEv54jWos,17462
+ PyamilySeq/PyamilySeq_Genus.py,sha256=KUC0QkCRpKQ9HEgxyTSD7Nc63wSXtriWyIqt_YOy5ys,12470
+ PyamilySeq/PyamilySeq_Species.py,sha256=gJy8Pn82Za44l6y9tg7bWJri2k_0OwZiplANIEH2o-c,16289
+ PyamilySeq/Seq_Combiner.py,sha256=3iJy7LNp7uBa3sU1F5bmov1ghvbcviOYqgkhbrbV1QQ,4737
+ PyamilySeq/Seq_Extractor.py,sha256=KMR0KcTJzrh99HcBN4qb76R2FuBvpYCDf4NwkmwhTPU,2870
+ PyamilySeq/Seq_Finder.py,sha256=ht-fSQ_opWKydcoWI9D3nTwLt6Rpgevnf2y0KxVjw4M,1881
+ PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ PyamilySeq/clusterings.py,sha256=9t9Q7IYb9x9gXxcv_FxsWqgdMQ-MYa-5OpkBzpgbrXc,22291
+ PyamilySeq/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ PyamilySeq/constants.py,sha256=WVns7PIMu89mNbb_lhu_Hf8fcX4AiUKiMKWAnwEHBvM,31
+ PyamilySeq/utils.py,sha256=aebXIUWIXsL3Zb47ONYqVoF1X504lJ4amewhpO1hNWE,33067
+ pyamilyseq-1.1.2.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ pyamilyseq-1.1.2.dist-info/METADATA,sha256=YlUvYX1GX0Acoh2V28jq0aMC-reFzEwoUWre8W2eK54,17979
+ pyamilyseq-1.1.2.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+ pyamilyseq-1.1.2.dist-info/entry_points.txt,sha256=mFq5TNzPI_B9vDRGEaT9pNPRGWFAgf_SE3R-dDNf1pM,662
+ pyamilyseq-1.1.2.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
+ pyamilyseq-1.1.2.dist-info/RECORD,,
{pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/WHEEL RENAMED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.1)
+ Generator: setuptools (80.3.1)
  Root-Is-Purelib: true
  Tag: py3-none-any

{pyamilyseq-1.1.0.dist-info → pyamilyseq-1.1.2.dist-info}/entry_points.txt RENAMED
@@ -1,10 +1,12 @@
  [console_scripts]
+ Cluster-Extractor = PyamilySeq.Cluster_Extractor:main
  Cluster-Summary = PyamilySeq.Cluster_Summary:main
  Group-Splitter = PyamilySeq.Group_Splitter:main
  PyamilySeq = PyamilySeq.PyamilySeq:main
  Seq-Combiner = PyamilySeq.Seq_Combiner:main
  Seq-Extractor = PyamilySeq.Seq_Extractor:main
  Seq-Finder = PyamilySeq.Seq_Finder:main
+ cluster-extractor = PyamilySeq.Cluster_Extractor:main
  cluster-summary = PyamilySeq.Cluster_Summary:main
  group-splitter = PyamilySeq.Group_Splitter:main
  pyamilyseq = PyamilySeq.PyamilySeq:main
pyamilyseq-1.1.0.dist-info/RECORD DELETED
@@ -1,20 +0,0 @@
- PyamilySeq/Cluster_Summary.py,sha256=xkzkC9mHRqHNm2bp0gcBWOobKTabltBrVv6ze4nikyg,6982
- PyamilySeq/Group_Extractor.py,sha256=oe2VmOVxdvTmAcy8NKwD1F27IdN2utAfczxsyxg96yc,2898
- PyamilySeq/Group_Sizes.py,sha256=3snkAN19o3Y4IY6IqSim1qy415FfQe1Wb8vzWTKF0Wo,3028
- PyamilySeq/Group_Splitter.py,sha256=OcMj9GnAyybs_DaNKRyvfL_nl2dB2gUI4BD_EQrBbWo,25653
- PyamilySeq/PyamilySeq.py,sha256=iyLU8YYn06ppCBCb-HaARm0ez_uagkj4FRtqr3pFu9Q,17396
- PyamilySeq/PyamilySeq_Genus.py,sha256=mBZNTWWk4a_5gsHzq0cG82Qck-SL9Xgg6jpByH6oKuk,12414
- PyamilySeq/PyamilySeq_Species.py,sha256=yyMHBQhTq1yFUKbiba6vL9nYJ4rlKsmS30hQNimGEB8,16120
- PyamilySeq/Seq_Combiner.py,sha256=wusCDrGwcVqQw3h5iPSY0_E5sauRFNHj0bXMBtELpl0,4700
- PyamilySeq/Seq_Extractor.py,sha256=KMR0KcTJzrh99HcBN4qb76R2FuBvpYCDf4NwkmwhTPU,2870
- PyamilySeq/Seq_Finder.py,sha256=ht-fSQ_opWKydcoWI9D3nTwLt6Rpgevnf2y0KxVjw4M,1881
- PyamilySeq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- PyamilySeq/clusterings.py,sha256=9fXnFZSypeZKtJK8OYONQKVpvLoeJCSxx2BRo5Vir38,22355
- PyamilySeq/constants.py,sha256=7zjA69Blx4ycagDKCvaKdVuXidVVwu7adnmgm53_CTI,31
- PyamilySeq/utils.py,sha256=EwZQP5eFaXiXVGw0Vckhu9tJkm8SOKAgGh7-JnIu_j4,29355
- pyamilyseq-1.1.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- pyamilyseq-1.1.0.dist-info/METADATA,sha256=f8hZWEdD06WYoflfPls02q_6FVFYL18np1BGzWXwlGA,17957
- pyamilyseq-1.1.0.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
- pyamilyseq-1.1.0.dist-info/entry_points.txt,sha256=Qao6g8F37k35MQFkUpGt9xoozRBaTkIKUptXWAUs5-E,554
- pyamilyseq-1.1.0.dist-info/top_level.txt,sha256=J6JhugUQTq4rq96yibAlQu3o4KCM9WuYfqr3w1r119M,11
- pyamilyseq-1.1.0.dist-info/RECORD,,