PyamilySeq 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/{Cluster_Compare.py → Group_Compare.py} +27 -13
- PyamilySeq/Group_Extractor.py +29 -12
- PyamilySeq/Group_Sizes.py +22 -8
- PyamilySeq/Group_Splitter.py +89 -29
- PyamilySeq/{Cluster_Summary.py → Group_Summary.py} +18 -20
- PyamilySeq/PyamilySeq.py +66 -43
- PyamilySeq/PyamilySeq_Genus.py +1 -1
- PyamilySeq/PyamilySeq_Species.py +30 -63
- PyamilySeq/Seq_Combiner.py +125 -15
- PyamilySeq/Seq_Extractor.py +24 -2
- PyamilySeq/Seq_Finder.py +20 -2
- PyamilySeq/clusterings.py +1 -1
- PyamilySeq/constants.py +142 -1
- PyamilySeq/utils.py +171 -84
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/METADATA +14 -14
- pyamilyseq-1.3.3.dist-info/RECORD +21 -0
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/WHEEL +1 -1
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/entry_points.txt +4 -4
- PyamilySeq/config.py +0 -0
- pyamilyseq-1.3.1.dist-info/RECORD +0 -22
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/licenses/LICENSE +0 -0
- {pyamilyseq-1.3.1.dist-info → pyamilyseq-1.3.3.dist-info}/top_level.txt +0 -0
PyamilySeq/PyamilySeq.py
CHANGED
@@ -1,6 +1,3 @@
-import argparse
-#from config import config_params
-
 try:
     from .PyamilySeq_Species import cluster as species_cluster
     from .PyamilySeq_Genus import cluster as genus_cluster
@@ -12,10 +9,11 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from constants import *
     from utils import *

-
-
+import traceback
+import sys

 def run_cd_hit(options, input_file, clustering_output, clustering_mode):
+    logger = logging.getLogger("PyamilySeq.PyamilySeq")
     cdhit_command = [
         clustering_mode,
         '-i', input_file,
@@ -29,14 +27,25 @@ def run_cd_hit(options, input_file, clustering_output, clustering_mode):
         '-sc', "1",
         '-sf', "1"
     ]
-
-
-
-
+    logger.debug("CD-HIT command: %s", " ".join(cdhit_command))
+    try:
+        if options.verbose:
+            ret = subprocess.run(cdhit_command)
+        else:
+            ret = subprocess.run(cdhit_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        if ret.returncode != 0:
+            logger.error("cd-hit returned non-zero exit code %s", ret.returncode)
+        else:
+            logger.info("cd-hit completed successfully: %s", clustering_output)
+    except Exception as e:
+        logger.exception("Failed to run cd-hit: %s", e)


 def main():
-
+    # Initial console-only logger so welcome and parser.description are logged before argparse outputs.
+    early_logger = configure_logger("PyamilySeq.PyamilySeq", enable_file=False, log_dir=None, verbose=False)
+    # Use LoggingArgumentParser so usage/errors are emitted via logger
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.PyamilySeq")#, description="PyamilySeq entrypoint")

     # Add subparsers for Full and Partial modes
     subparsers = parser.add_subparsers(dest="run_mode", required=True, help="Choose a mode: 'Full' or 'Partial'.")
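The bootstrap code above relies on two helpers, configure_logger and LoggingArgumentParser, which arrive via the star imports from constants/utils and whose definitions are not shown in this excerpt (constants.py grows by 142 lines in this release, presumably where they live). A minimal sketch of what their call sites imply, with names and behaviour inferred rather than taken from the package:

    import argparse
    import logging
    import os
    import time

    def configure_logger(name, enable_file=False, log_dir=None, verbose=False):
        # Hypothetical reconstruction: console handler always; optional timestamped logfile.
        logger = logging.getLogger(name)
        logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        logger.handlers.clear()  # keeps the early + post-parse calls from stacking handlers
        fmt = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
        console = logging.StreamHandler()
        console.setFormatter(fmt)
        logger.addHandler(console)
        if enable_file:
            target = log_dir or os.getcwd()
            os.makedirs(target, exist_ok=True)
            logfile = os.path.join(target, name + "_" + time.strftime("%Y%m%d-%H%M%S") + ".log")
            handler = logging.FileHandler(logfile)
            handler.setFormatter(fmt)
            logger.addHandler(handler)
        return logger

    class LoggingArgumentParser(argparse.ArgumentParser):
        # Hypothetical reconstruction: route argparse usage/error text through the logger.
        def __init__(self, *args, logger_name=None, **kwargs):
            super().__init__(*args, **kwargs)
            self._logger = logging.getLogger(logger_name or __name__)

        def error(self, message):
            self._logger.error("%s", message)
            super().error(message)  # prints usage and exits with status 2

Whatever the real implementations do, main() calls configure_logger twice on the same logger name (once before and once after parse_args()), so handler setup needs to be idempotent, as the handlers.clear() line above illustrates.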
@@ -109,28 +118,28 @@ def main():
     subparser.add_argument("-T", type=int, default=8, dest="threads", required=False,
                            help="Number of threads for clustering/alignment - CD-HIT parameter '-T' | MAFFT parameter '--thread'.")

-
-
-
-
-
+    # Miscellaneous Arguments
+    # Global logging options (user controls logfile creation)
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None,
+                        help="Directory for logfile (default: output dir or cwd).")
+    parser.add_argument("-verbose", action="store_true",
+                        help="Print verbose output.")
+    parser.add_argument("-v", "--version", action="version",
+                        version=f"PyamilySeq {PyamilySeq_Version}: Exiting.")

     # Parse Arguments
     options = parser.parse_args()
-    ## Configuration

+    # Setup logger once we know output paths/options
+    # after we resolve output_path / options.output_dir:
+    resolved_log_dir = options.log_dir if getattr(options, "log_dir", None) else (os.path.abspath(options.output_dir) if getattr(options, "output_dir", None) else os.getcwd())
+    logger = configure_logger("PyamilySeq.PyamilySeq", enable_file=getattr(options, "log", False), log_dir=resolved_log_dir, verbose=options.verbose)
+    logger.info("Running PyamilySeq %s in %s mode", PyamilySeq_Version, getattr(options, "run_mode", "N/A"))
+    if options.verbose:
+        logger.debug("Options: %s", vars(options))

-    if options.write_groups != None and options.write_individual_groups == False:
-        options.write_individual_groups = True

-    # Example of conditional logic based on selected mode
-    print(f"Running PyamilySeq {PyamilySeq_Version} in {options.run_mode} mode:")
-    if options.run_mode == "Full" and options.verbose == True:
-        print("Processing Full mode with options:", vars(options))
-    elif options.run_mode == "Partial" and options.verbose == True:
-        print("Processing Partial mode with options:", vars(options))
-
-    ### Checking all required parameters are provided by user #!!# Doesn't seem to work
     if options.run_mode == 'Full':
         options.clustering_format = 'CD-HIT'
         if getattr(options, 'reclustered', None) is not None:
@@ -145,6 +154,7 @@ def main():
             missing_options = [opt for opt in
                                ['input_type', 'input_dir', 'name_split_gff', 'clustering_format', 'pident', 'len_diff'] if
                                not options.__dict__.get(opt)]
+            logger.error("Missing required options for Full mode: %s", ', '.join(missing_options))
             sys.exit(f"Missing required options for Full mode: {', '.join(missing_options)}")
         if options.align_core:
             options.write_individual_groups = True
@@ -176,34 +186,28 @@ def main():
     ##MAFFT
     if options.align_core == True:
         if is_tool_installed('mafft'):
-
-            print("mafft is installed. Proceeding with alignment.")
+            logger.info("mafft is installed. Proceeding with alignment.")
         else:
+            logger.error("mafft is not installed. Please install mafft to proceed.")
             exit("mafft is not installed. Please install mafft to proceed.")
     ##CD-HIT
     if options.run_mode == 'Full':
         if is_tool_installed('cd-hit'):
-
-            print("cd-hit is installed. Proceeding with clustering.")
+            logger.info("cd-hit is installed. Proceeding with clustering.")
             if options.sequence_type == 'DNA':
                 clustering_mode = 'cd-hit-est'
             elif options.sequence_type == 'AA':
                 clustering_mode = 'cd-hit'
             if options.fast_mode == True:
                 options.fast_mode = 1
-
-                print("Running CD-HIT in fast mode.")
+                logger.info("Running CD-HIT in fast mode.")
             else:
                 options.fast_mode = 0
-
-                print("Running CD-HIT in accurate mode.")
+                logger.info("Running CD-HIT in accurate mode.")
         else:
+            logger.error("cd-hit is not installed. Please install cd-hit to proceed.")
             exit("cd-hit is not installed. Please install cd-hit to proceed.")

-
-    # if options.write_groups != None and options.original_fasta == False:
-    #     exit("-fasta must br provided if -w is used")
-
     if hasattr(options, 'cluster_file') and options.cluster_file:
         options.cluster_file = fix_path(options.cluster_file)
     if hasattr(options, 'reclustered') and options.reclustered:
@@ -308,10 +312,29 @@ def main():


     if options.group_mode == 'Species':
-
+        try:
+            species_cluster(clustering_options)
+            logger.info("Invoked species clustering.")
+        except FileNotFoundError as e:
+            logger.error("File not found during species clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)
+        except Exception as e:
+            logger.error("Unexpected error during species clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)
     elif options.group_mode == 'Genus':
-
-
+        try:
+            genus_cluster(clustering_options)
+            logger.info("Invoked genus clustering.")
+        except FileNotFoundError as e:
+            logger.error("File not found during genus clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)
+        except Exception as e:
+            logger.error("Unexpected error during genus clustering: %s", e)
+            logger.debug("Traceback:\n%s", traceback.format_exc())
+            sys.exit(1)

     # Save arguments to a text file
     from datetime import datetime
@@ -319,9 +342,9 @@ def main():
         outfile.write(f"Timestamp: {datetime.now().isoformat()}\n")
         for arg, value in vars(options).items():
             outfile.write(f"{arg}: {value}\n")
+    logger.info("Saved parameters to %s", os.path.join(output_path, "PyamilySeq_params.txt"))
+

-    print("Thank you for using PyamilySeq -- A detailed user manual can be found at https://github.com/NickJD/PyamilySeq\n"
-          "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues\n#####")

 if __name__ == "__main__":
     main()
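The reworked run_cd_hit above swaps a bare command invocation for a quiet-by-default subprocess.run call (child output only shown under -verbose) with return-code checking. The same pattern, distilled into a standalone sketch that runs without cd-hit installed (the placeholder command is the current Python interpreter, an assumption for demonstration only):

    import logging
    import subprocess
    import sys

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("demo")

    def run_quietly(command, verbose=False):
        # Suppress the child's stdout/stderr unless the caller asked for verbose output.
        logger.debug("command: %s", " ".join(command))
        try:
            if verbose:
                ret = subprocess.run(command)
            else:
                ret = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            if ret.returncode != 0:
                logger.error("command returned non-zero exit code %s", ret.returncode)
            else:
                logger.info("command completed successfully")
        except Exception:
            # A FileNotFoundError here typically means the binary is not on PATH.
            logger.exception("failed to run command")

    run_quietly([sys.executable, "--version"])                # child output suppressed
    run_quietly([sys.executable, "--version"], verbose=True)  # child output visible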
PyamilySeq/PyamilySeq_Genus.py
CHANGED
@@ -88,7 +88,7 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, cores, Second_num):
     except KeyError:
         cores['extended_genera_>'].append(cluster)
 #@profile
-def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with
+def calc_multi_First_extended_Second_only_core(cluster, First_num, cores, Second_num): # Count seperately those gene families extended with StORF-Reporter but combined >1 PEP
     group = First_num + Second_num
     try:
         cores['combined_genera_' + str(group)].append(cluster)
PyamilySeq/PyamilySeq_Species.py
CHANGED
@@ -9,7 +9,7 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from utils import *


-def gene_presence_absence_output(options, genome_dict, 
+def gene_presence_absence_output(options, genome_dict,
                                  pangenome_clusters_First_sequences_sorted,
                                  combined_pangenome_clusters_First_Second_clustered=None,
                                  combined_pangenome_clusters_Second_sequences_sorted=None):
@@ -137,48 +137,6 @@ def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sequences_sorted,
     if options.reclustered is not None:
         print(f"Merged Second cluster IDs: {len(merged_second_cluster_ids)}")

-    # def gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sorted, pangenome_clusters_First_sequences_sorted):
-    #     print("Outputting gene_presence_absence file")
-    #     output_dir = os.path.abspath(options.output_dir)
-    #     #in_name = options.clusters.split('.')[0].split('/')[-1]
-    #     gpa_outfile = os.path.join(output_dir, 'gene_presence_absence.csv')
-    #     gpa_outfile = open(gpa_outfile, 'w')
-    #     genome_dict = OrderedDict(sorted(genome_dict.items()))
-    #     gpa_outfile.write('"Gene","Non-unique Gene name","Annotation","No. isolates","No. sequences","Avg sequences per isolate","Genome Fragment","Order within Fragment",'
-    #                       '"Accessory Fragment","Accessory Order with Fragment","QC","Min group size nuc","Max group size nuc","Avg group size nuc","')
-    #     gpa_outfile.write('","'.join(genome_dict.keys()))
-    #     gpa_outfile.write('"\n')
-    #     for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
-    #         average_sequences_per_genome = len(sequences) / len(pangenome_clusters_First_sorted[cluster])
-    #         gpa_outfile.write('"group_'+str(cluster)+'","","","'+str(len(pangenome_clusters_First_sorted[cluster]))+'","'+str(len(sequences))+'","'+str(average_sequences_per_genome)+
-    #                           '","","","","","","","",""')
-    #
-    #
-    #         for genome in genome_dict.keys():
-    #             full_out = ''
-    #             tmp_list = []
-    #             for value in sequences:
-    #                 if value.split('|')[0] == genome:
-    #                     tmp_list.append(value.split('|')[1])
-    #             if tmp_list:
-    #                 full_out += ',"'+' '.join(tmp_list)+'"'
-    #             else:
-    #                 full_out = ',""'
-    #             gpa_outfile.write(full_out)
-    #         gpa_outfile.write('\n')
-
-    ### Below is some unfinished code
-    # edge_list_outfile = open(in_name+'_edge_list.csv','w')
-    # for cluster, sequences in pangenome_clusters_First_sequences_sorted.items():
-    #     output = []
-    #     for entry in sequences:
-    #         # Split each entry at '|'
-    #         genome, gene = entry.split('|')
-    #         # Format the result as "gene genome"
-    #         output.append(f"{gene}\t{genome}")
-    #     for line in output:
-    #         edge_list_outfile.write(line + '\n')
-



@@ -209,7 +167,7 @@ def get_cores(options,genome_dict):
             cores[only_second_core_group] = []
     return cores, groups

-
+
 def calc_First_only_core(cluster, First_num, groups, cores):
     groups_as_list = list(groups.values())
     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= int(First_num) <= fir):
@@ -217,7 +175,7 @@ def calc_First_only_core(cluster, First_num, groups, cores):
         family_group = list(groups)[res]
         cores['First_core_'+family_group].append(cluster)

-
+
 def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count gene families extended with StORFs
     groups_as_list = list(groups.values())
     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= First_num+Second_num <= fir):
@@ -227,8 +185,8 @@ def calc_single_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num):
         cores['extended_core_' + family_group].append(cluster)


-
-def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with
+
+def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num): # Count seperately those gene families extended with StORF-Reporter but combined >1 PEP
     groups_as_list = list(groups.values())
     # Looping through the list to find the matching condition
     for idx, (sec, fir) in enumerate(groups_as_list):
@@ -239,7 +197,7 @@ def calc_multi_First_extended_Second_only_core(cluster, First_num, groups, cores, Second_num):
         cores['combined_core_' + family_group].append(cluster)


-
+
 def calc_Second_only_core(cluster, Second_num, groups, cores):
     groups_as_list = list(groups.values())
     for idx in (idx for idx, (sec, fir) in enumerate(groups_as_list) if sec <= Second_num <= fir):
@@ -247,7 +205,7 @@ def calc_Second_only_core(cluster, Second_num, groups, cores):
         family_group = list(groups)[res]
         cores['Second_core_' + family_group].append(cluster)

-
+
 def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
     try:
         groups_as_list = list(groups.values())
@@ -259,7 +217,7 @@ def calc_only_Second_only_core(cluster, Second_num, groups, cores): # only count the true storf onlies
         sys.exit("Error in calc_only_Second_only_core")


-
+
 def cluster(options):

     if options.cluster_format == 'CD-HIT':
@@ -273,18 +231,17 @@ def cluster(options):
     cores, groups = get_cores(options, genome_dict)
     ###

-    if options.reclustered != None: #
+    if options.reclustered != None: # Combined clustering
         if options.cluster_format == 'CD-HIT':
             combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_CDHIT(options, genome_dict, '|')
         elif 'TSV' in options.cluster_format or 'CSV' in options.cluster_format:
-            #Fix
             combined_pangenome_clusters_First_Second_clustered, not_Second_only_cluster_ids, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences = combined_clustering_Edge_List(options, '|')

         pangenome_clusters_Type = combined_clustering_counting(options, pangenome_clusters_First, reps, combined_pangenome_clusters_First_Second_clustered, pangenome_clusters_First_genomes, combined_pangenome_clusters_Second, combined_pangenome_clusters_Second_sequences, '|')

         # Sort First clusters
         sorted_First_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
-        pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
+        #pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
         pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences, sorted_First_keys)
         pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_First_keys)

@@ -296,7 +253,7 @@ def cluster(options):
     else:
         pangenome_clusters_Type = single_clustering_counting(pangenome_clusters_First, reps)
         sorted_First_keys = sort_keys_by_values(pangenome_clusters_First, pangenome_clusters_First_sequences)
-        pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
+        #pangenome_clusters_First_sorted = reorder_dict_by_keys(pangenome_clusters_First, sorted_First_keys)
         pangenome_clusters_First_sequences_sorted = reorder_dict_by_keys(pangenome_clusters_First_sequences,
                                                                          sorted_First_keys)
         pangenome_clusters_Type_sorted = reorder_dict_by_keys(pangenome_clusters_Type, sorted_First_keys)
@@ -375,17 +332,16 @@ def cluster(options):
     if options.gene_presence_absence_out != False:
         if options.reclustered != None:
             # Pass both First and Second clustering data
-            gene_presence_absence_output(options, genome_dict, 
+            gene_presence_absence_output(options, genome_dict,
                                          pangenome_clusters_First_sequences_sorted,
                                          combined_pangenome_clusters_First_Second_clustered,
                                          combined_pangenome_clusters_Second_sequences_sorted)
         else:
             # Only First clustering data available
-            gene_presence_absence_output(options, genome_dict, 
-                                         pangenome_clusters_First_sequences_sorted)
+            gene_presence_absence_output(options, genome_dict, pangenome_clusters_First_sequences_sorted)


-    ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -
+    ###Need to fix this below. If full/partial the ifs need to be different. If full we first need to output the gfs then align. if -write-groups not presented then it needs
     # to be done for alignment full anyway...

     genome_list = list(genome_dict.keys())
@@ -400,17 +356,24 @@ def cluster(options):
                 outfile.write('>group_'+str(cluster)+'\n')
                 wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                 outfile.write(wrapped_aa_seq+'\n')
-        if options.write_groups != 
+        if options.write_groups != False:
             print("Outputting gene group FASTA files")
             #output_dir = os.path.dirname(os.path.abspath(options.output_dir))
             output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
             write_groups_func(options,output_dir, key_order, cores, sequences,
                               pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

-        if options.align_core != 
+        if options.align_core != False:
             print("Processing gene group alignment")
             process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

+        if options.write_individual_groups == True:
+            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
+            write_individual_groups(options, output_dir, key_order, cores, sequences,
+                                    pangenome_clusters_First_sequences_sorted,
+                                    combined_pangenome_clusters_Second_sequences)
+
+
     elif options.run_mode == 'Partial':
         sequences = read_fasta(options.fasta)
         if options.reclustered == None:
@@ -432,16 +395,21 @@ def cluster(options):
                 outfile.write('>group_'+str(cluster)+'\n')
                 wrapped_aa_seq = wrap_sequence(sequences[ids[0]], 60)
                 outfile.write(wrapped_aa_seq+'\n')
-        if options.write_groups != 
+        if options.write_groups != False:
             print("Outputting gene group FASTA files")
             output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
             write_groups_func(options,output_dir, key_order, cores, sequences,
                               pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences)

-        if options.align_core != 
+        if options.align_core != False:
             print("Processing gene group alignment")
             process_gene_groups(options, output_dir, None, None, genome_list, 'core_gene_alignment.aln')

+        if options.write_individual_groups == True:
+            output_dir = os.path.join(options.output_dir, 'Gene_Groups_Output')
+            write_individual_groups(options, output_dir, key_order, cores, sequences,
+                                    pangenome_clusters_First_sequences_sorted,
+                                    combined_pangenome_clusters_Second_sequences)


     #
@@ -461,4 +429,3 @@ def cluster(options):
     #
     #
     #
-
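Both branches of cluster() above now comment out pangenome_clusters_First_sorted, keeping only the sequence- and type-sorted views. The helpers involved, sort_keys_by_values and reorder_dict_by_keys, live in utils.py and are not shown in this diff; their apparent role is to order cluster IDs by family size so downstream output walks the largest groups first. A toy illustration with hypothetical stand-in implementations (not the package's own code):

    def sort_keys_by_values(clusters, cluster_sequences):
        # Stand-in: order cluster IDs by how many sequences each family holds, largest first.
        return sorted(clusters, key=lambda c: len(cluster_sequences[c]), reverse=True)

    def reorder_dict_by_keys(d, key_order):
        # Stand-in: rebuild a dict so iteration follows the given key order.
        return {k: d[k] for k in key_order}

    clusters = {'0': ['genomeA'], '1': ['genomeA', 'genomeB', 'genomeC'], '2': ['genomeA', 'genomeB']}
    cluster_seqs = {'0': ['genomeA|x1'],
                    '1': ['genomeA|y1', 'genomeB|y2', 'genomeC|y3'],
                    '2': ['genomeA|z1', 'genomeB|z2']}
    order = sort_keys_by_values(clusters, cluster_seqs)
    print(order)                                            # ['1', '2', '0']
    print(list(reorder_dict_by_keys(cluster_seqs, order)))  # keys now largest-family first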
PyamilySeq/Seq_Combiner.py
CHANGED
@@ -1,6 +1,3 @@
-import argparse
-
-
 try:
     from .constants import *
     from .utils import *
@@ -8,10 +5,94 @@ except (ModuleNotFoundError, ImportError, NameError, TypeError) as error:
     from constants import *
     from utils import *

+import threading
+import time
+import os
+from typing import Optional
+import re
+
+def count_matching_files(input_dir: str, name_split: Optional[str], extensions):
+    """
+    Count input files in input_dir that match the provided extensions and, if name_split supplied,
+    contain the name_split substring in the filename. This is used to compute total work units (files).
+    """
+    if not input_dir or not os.path.isdir(input_dir):
+        return 0
+    total = 0
+    for fname in os.listdir(input_dir):
+        low = fname.lower()
+        if any(low.endswith(ext) for ext in extensions):
+            if name_split:
+                if name_split in fname:
+                    total += 1
+            else:
+                total += 1
+    return total
+
+def count_files_present_in_combined(combined_file: str, name_split: Optional[str]) -> int:
+    """
+    Heuristic: count number of distinct input files (genomes) already present in the combined output.
+    Primary approach: parse headers and take the second '|' field (header.split('|')[1]) as genome/file id.
+    If that parsing fails, look for tokens containing name_split inside the header.
+    """
+    if not combined_file or not os.path.exists(combined_file):
+        return 0
+    seen = set()
+    try:
+        with open(combined_file, 'r') as fh:
+            for line in fh:
+                if not line.startswith('>'):
+                    continue
+                header = line[1:].strip()
+                # 1) Prefer headers like ">id|genome|rest" -> take genome (second field)
+                if '|' in header:
+                    parts = header.split('|')
+                    if len(parts) > 1 and parts[1]:
+                        seen.add(parts[1])
+                        continue
+                # 2) If name_split provided, look for a filename-like token that includes it
+                if name_split:
+                    match = re.search(r'([^\s/\\]*' + re.escape(name_split) + r'[^\s/\\]*)', header)
+                    if match:
+                        token = os.path.basename(match.group(1))
+                        seen.add(token)
+                        continue
+                # 3) If nothing matched, skip this header (avoids per-sequence overcounting)
+    except Exception:
+        return 0
+    return len(seen)
+
+# Helpers for progress reporting

+def progress_reporter(stop_event, logger, total_files, combined_file, name_split=None, interval=10):
+    """
+    Periodically log progress. Preference: count headers in combined_file.
+    Falls back to simple heartbeat if combined_file isn't yet created.
+    """
+    start = time.time()
+    while not stop_event.is_set():
+        # Use number of distinct input files represented in the combined output for "processed"
+        processed = count_files_present_in_combined(combined_file, name_split) if combined_file else 0
+        # Cap processed to total_files (prevents >100%)
+        if total_files > 0 and processed > total_files:
+            processed = total_files
+        pct = (processed / total_files * 100) if total_files > 0 else 0.0
+        elapsed = time.time() - start
+        logger.info("Progress: %d/%d processed (%.1f%%). Elapsed: %.0fs", processed, total_files, pct, elapsed)
+        # Wait with early exit support
+        stop_event.wait(interval)
+    # Final log when exiting
+    processed = count_files_present_in_combined(combined_file, name_split) if combined_file else 0
+    if total_files > 0 and processed > total_files:
+        processed = total_files
+    pct = (processed / total_files * 100) if total_files > 0 else 0.0
+    elapsed = time.time() - start
+    logger.info("Final progress: %d/%d processed (%.1f%%). Total elapsed: %.0fs", processed, total_files, pct, elapsed)

 def main():
-
+    # Early console-only logger so parser.description is logged before help/usage.
+    early_logger = configure_logger("PyamilySeq.Seq_Combiner", enable_file=False, log_dir=None, verbose=False)
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Combiner", description='Running Seq-Combiner - A tool to extract sequences from GFF/FASTA files and prepare them for PyamilySeq.')
     ### Required Arguments
     required = parser.add_argument_group('Required Arguments')
     required.add_argument('-input_dir', action='store', dest='input_dir',
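The count_files_present_in_combined heuristic above hinges on the header convention of the combined FASTA: the genome/file identifier is expected in the second '|'-delimited field. A small self-contained check of that behaviour (toy data only):

    import tempfile

    records = [
        ">gene_001|genomeA|CDS", "ATGAAA",
        ">gene_002|genomeA|CDS", "ATGCCC",
        ">gene_003|genomeB|CDS", "ATGTTT",
    ]
    with tempfile.NamedTemporaryFile("w", suffix=".fasta", delete=False) as tmp:
        tmp.write("\n".join(records) + "\n")

    # Two distinct genomes appear in the headers, however many sequences each contributes.
    print(count_files_present_in_combined(tmp.name, None))  # -> 2

This is also why the function deliberately skips headers it cannot parse: counting every '>' line instead would report sequences rather than input files, and the progress reporter would overshoot its total.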
@@ -47,37 +128,66 @@ def main():
     misc.add_argument("-v", "--version", action="version",
                       version=f"PyamilySeq: Seq-Combiner version {PyamilySeq_Version} - Exiting",
                       help="Print out version number and exit")
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: output_dir).")

     options = parser.parse_args()

+    # Setup logger for Seq-Combiner
+    output_path = os.path.abspath(options.output_dir)
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+    log_dir = options.log_dir if getattr(options, "log_dir", None) else output_path
+    logger = configure_logger("PyamilySeq.Seq_Combiner", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+    # --- Progress reporting setup ------------------------------------------------
+    combined_out_file = os.path.join(output_path, options.output_file)
+    # Determine name_split and extensions per mode and count matching input files as total work units
+    if options.input_type == 'fasta':
+        name_split = options.name_split_fasta
+        exts = ('.fasta', '.fa', '.fna')
+    else:  # 'separate' or 'combined'
+        name_split = options.name_split_gff
+        exts = ('.gff', '.gff3', '.gff.gz', '.gff3.gz')
+
+    total_work = count_matching_files(options.input_dir, name_split, exts)
+    logger.info("Found %d input files (matching pattern) to process in %s", total_work, options.input_dir)
+
+    stop_event = threading.Event()
+    reporter_thread = threading.Thread(target=progress_reporter, args=(stop_event, logger, total_work, combined_out_file, name_split, 10), daemon=True)
+    reporter_thread.start()
+    # ---------------------------------------------------------------------------

     if options.input_type == 'separate' and options.name_split_gff is None:
+        logger.error("Please provide a substring to split the filename and extract the genome name.")
         print("Please provide a substring to split the filename and extract the genome name.")
         exit(1)
     if options.input_type == 'combined' and options.name_split_gff is None:
+        logger.error("Please provide a substring to split the filename and extract the genome name.")
         print("Please provide a substring to split the filename and extract the genome name.")
         exit(1)
     if options.input_type == 'fasta' and options.name_split_fasta is None:
+        logger.error("Please provide a substring to split the filename and extract the genome name.")
         print("Please provide a substring to split the filename and extract the genome name.")
         exit(1)

-    output_path = os.path.abspath(options.output_dir)
-    if not os.path.exists(output_path):
-        os.makedirs(output_path)
-
-    #output_file = options.output_file + '.fasta'
-    if os.path.exists(os.path.join(output_path, options.output_file)):
-        print(f"Output file {options.output_file} already exists in the output directory. Please delete or rename the file and try again.")
-        exit(1)
-
-    combined_out_file = os.path.join(output_path, options.output_file )
-
     if options.input_type == 'separate':
+        logger.info("Processing 'separate' input_type from %s", options.input_dir)
         read_separate_files(options.input_dir, options.name_split_gff, options.name_split_fasta, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'combined':
+        logger.info("Processing 'combined' input_type from %s", options.input_dir)
         read_combined_files(options.input_dir, options.name_split_gff, options.gene_ident, combined_out_file, options.translate, True)
     elif options.input_type == 'fasta':
+        logger.info("Processing 'fasta' input_type from %s", options.input_dir)
         read_fasta_files(options.input_dir, options.name_split_fasta, combined_out_file, options.translate, True)
+    logger.info("Seq-Combiner completed.")
+
+    # Stop reporter and wait for final log
+    stop_event.set()
+    reporter_thread.join(timeout=5)
+    # Final summary: count number of input files represented (heuristic)
+    final_files = count_files_present_in_combined(combined_out_file, name_split)
+    logger.info("Completed combining. Final combined file: %s (input files represented: %d)", combined_out_file, final_files)

 if __name__ == "__main__":
     main()
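In main() above, the reporter runs on a daemon thread and is shut down through an Event, with stop_event.wait(interval) doubling as an interruptible sleep so the final progress line appears as soon as combining finishes rather than up to a full interval later. The wiring, reduced to its essentials:

    import logging
    import threading
    import time

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("reporter-demo")

    def heartbeat(stop_event, interval=1):
        while not stop_event.is_set():
            log.info("working...")
            stop_event.wait(interval)  # returns early the moment stop_event is set
        log.info("final report")

    stop = threading.Event()
    t = threading.Thread(target=heartbeat, args=(stop,), daemon=True)
    t.start()
    time.sleep(2.5)   # stand-in for the real combining work
    stop.set()        # ask the reporter to emit its final line and exit
    t.join(timeout=5)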
PyamilySeq/Seq_Extractor.py
CHANGED
@@ -1,5 +1,12 @@
-
+
 import copy
+import os
+
+# Use centralised logger factory
+try:
+    from .constants import configure_logger, LoggingArgumentParser
+except Exception:
+    from constants import configure_logger, LoggingArgumentParser

 def find_gene_ids_in_csv(csv_file, group_name):
     """Find gene IDs associated with the specified group name in the CSV file, starting from column 14."""
@@ -39,7 +46,10 @@ def extract_sequences(fasta_file, gene_ids):
     return sequences

 def main():
-
+    # Early console-only logger so parser.description appears in logger output before argparse prints the menu.
+    early_logger = configure_logger("PyamilySeq.Seq_Extractor", enable_file=False, log_dir=None, verbose=False)
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Extractor", description="Running Seq-Extractor - A tool to extract sequences for specified group name from CSV file and corresponding FASTA file.")
+
     parser.add_argument("-csv", action='store', dest='csv_file',
                         help="CSV file containing group data", required=True)
     parser.add_argument("-group", action='store', dest='group_name',
@@ -48,22 +58,34 @@ def main():
                         help="Input FASTA file containing sequences", required=True)
     parser.add_argument("-out", action='store', dest='output_file',
                         help="Output FASTA file with extracted sequences", required=True)
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: dir of output_file).")

     options = parser.parse_args()

+    # Setup logger
+    out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
+    log_dir = options.log_dir if getattr(options, "log_dir", None) else out_dir
+    logger = configure_logger("PyamilySeq.Seq_Extractor", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+    logger.info("Searching for gene IDs in CSV %s for group %s", options.csv_file, options.group_name)
+
     # Find gene IDs in CSV
     gene_ids = find_gene_ids_in_csv(options.csv_file, options.group_name)
     if not gene_ids:
+        logger.warning("No gene IDs found for group name '%s' in the CSV.", options.group_name)
         print(f"No gene IDs found for group name '{options.group_name}' in the CSV.")
         return

     # Extract sequences from the FASTA file
+    logger.info("Extracting sequences from FASTA: %s", options.fasta_file)
    sequences = extract_sequences(options.fasta_file, gene_ids)

     # Write matched sequences to the output FASTA file
     with open(options.output_file, 'w') as output:
         for gene_id, sequence_lines in sequences.items():
             output.write("\n".join(sequence_lines) + "\n")
+    logger.info("Wrote %d sequences to %s", len(sequences), options.output_file)

 if __name__ == "__main__":
     main()