PyamilySeq-1.3.2-py3-none-any.whl → PyamilySeq-1.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyamilySeq/Group_Compare.py +27 -13
- PyamilySeq/Group_Extractor.py +29 -12
- PyamilySeq/Group_Sizes.py +22 -8
- PyamilySeq/Group_Splitter.py +89 -29
- PyamilySeq/Group_Summary.py +18 -20
- PyamilySeq/PyamilySeq.py +66 -43
- PyamilySeq/PyamilySeq_Genus.py +1 -1
- PyamilySeq/PyamilySeq_Species.py +30 -63
- PyamilySeq/Seq_Combiner.py +125 -15
- PyamilySeq/Seq_Extractor.py +24 -2
- PyamilySeq/Seq_Finder.py +20 -2
- PyamilySeq/clusterings.py +1 -1
- PyamilySeq/constants.py +142 -1
- PyamilySeq/utils.py +171 -84
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/METADATA +11 -11
- pyamilyseq-1.3.3.dist-info/RECORD +21 -0
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/WHEEL +1 -1
- PyamilySeq/config.py +0 -0
- pyamilyseq-1.3.2.dist-info/RECORD +0 -22
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/entry_points.txt +0 -0
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/licenses/LICENSE +0 -0
- {pyamilyseq-1.3.2.dist-info → pyamilyseq-1.3.3.dist-info}/top_level.txt +0 -0
PyamilySeq/Seq_Finder.py
CHANGED
@@ -1,6 +1,12 @@
-import argparse
 import collections
 import csv
+import os
+
+# Use centralised logger factory
+try:
+    from .constants import configure_logger, LoggingArgumentParser
+except Exception:
+    from constants import configure_logger, LoggingArgumentParser


 def parse_fasta_ids(fasta_file):
@@ -29,16 +35,27 @@ def find_ids_in_csv(ids, csv_file):


 def main():
-
+    # Early console-only logger so the parser description and argparse messages are logged via logger.
+    early_logger = configure_logger("PyamilySeq.Seq_Finder", enable_file=False, log_dir=None, verbose=False)
+    parser = LoggingArgumentParser(logger_name="PyamilySeq.Seq_Finder", description="Running Seq-Finder: A tool to extract IDs from a FASTA file and search for them in a CSV file.")
+
     parser.add_argument("-in", action='store', dest='fasta_file',
                         help="Input FASTA file", required=True)
     parser.add_argument("-ids", action='store', dest='csv_file',
                         help="CSV file containing IDs to search for", required=True)
     parser.add_argument("-out", action='store', dest='output_file',
                         help="Output file to save found IDs", required=True)
+    parser.add_argument("--log", action="store_true", dest="log", help="Create a timestamped logfile for this run.")
+    parser.add_argument("--log-dir", dest="log_dir", default=None, help="Directory for logfile (default: dir of output_file).")

     options = parser.parse_args()

+    # Setup logger
+    out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
+    log_dir = options.log_dir if getattr(options, "log_dir", None) else out_dir
+    logger = configure_logger("PyamilySeq.Seq_Finder", enable_file=getattr(options, "log", False), log_dir=log_dir, verbose=False)
+
+    logger.info("Parsing FASTA IDs from %s", options.fasta_file)
     # Parse IDs from the FASTA file
     ids = parse_fasta_ids(options.fasta_file)

@@ -50,6 +67,7 @@ def main():
             output.write("ID,Found_In_First_Column\n")
             for seq_id, found_in in found_records.items():
                 output.write(f"{seq_id},{found_in}\n")
+    logger.info("Wrote found records for %d IDs to %s", len(found_records), options.output_file)


 if __name__ == "__main__":
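The two-stage setup above — a console-only logger created before the parser so the banner and description are captured, then a second configure_logger call once the output path and --log flags are known — is the pattern this release applies across its tools. A minimal sketch of the flow (the tool name "PyamilySeq.Example_Tool" is illustrative, not part of the package):

    import os
    from PyamilySeq.constants import configure_logger, LoggingArgumentParser

    def main():
        # Stage 1: console-only logger; LoggingArgumentParser logs its description on creation.
        configure_logger("PyamilySeq.Example_Tool", enable_file=False, log_dir=None, verbose=False)
        parser = LoggingArgumentParser(logger_name="PyamilySeq.Example_Tool",
                                       description="Example tool wired to the centralised logger factory.")
        parser.add_argument("-out", dest="output_file", required=True)
        parser.add_argument("--log", action="store_true", dest="log")
        parser.add_argument("--log-dir", dest="log_dir", default=None)
        options = parser.parse_args()

        # Stage 2: reconfigure once the output directory is known; handlers are rebuilt, not stacked.
        out_dir = os.path.abspath(os.path.dirname(options.output_file)) if options.output_file else os.getcwd()
        logger = configure_logger("PyamilySeq.Example_Tool",
                                  enable_file=options.log,
                                  log_dir=options.log_dir if options.log_dir else out_dir)
        logger.info("Ready: file logging %s", "on" if options.log else "off")

    if __name__ == "__main__":
        main()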
PyamilySeq/clusterings.py
CHANGED
@@ -441,7 +441,7 @@ def combined_clustering_Edge_List(options, splitter):
                 combined_pangenome_clusters_Second_sequences[str(cluster_id)].append(child)
             else:
                 if str(cluster_id) not in not_Second_only_cluster_ids:
-                    not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which
+                    not_Second_only_cluster_ids.append(str(cluster_id)) # Tell us which StORF-Reporter clustered are unmatched to a PEP
                 if child_taxa not in combined_pangenome_clusters_First[str(cluster_id)]:
                     combined_pangenome_clusters_First[str(cluster_id)].append(child_taxa)
                     combined_pangenome_clusters_First_sequences[str(cluster_id)].append(child)
PyamilySeq/constants.py
CHANGED
@@ -1,2 +1,143 @@
-
+import logging
+import os
+import sys
+import argparse
+from datetime import datetime
+from io import StringIO
+import re

+PyamilySeq_Version = 'v1.3.3'
+WELCOME = f"Thank you for using PyamilySeq {PyamilySeq_Version} - A tool for gene clustering and pangenome analysis."
+CITATION = "Please Cite PyamilySeq: https://doi.org/10.1093/nargab/lqaf198"
+ISSUE = "Please report any issues to: https://github.com/NickJD/PyamilySeq/issues"
+
+def configure_logger(logger_name, enable_file=False, log_dir=None, level=logging.INFO, verbose=False):
+    """
+    Create and return a configured logger.
+    - logger_name: full logger name (e.g. "PyamilySeq.Group_Splitter")
+    - enable_file: if True, create a timestamped logfile in log_dir
+    - log_dir: directory for logfile (defaults to cwd)
+    - level: console log level (default INFO)
+    - verbose: if True, sets console level to DEBUG and file to DEBUG
+    """
+    logger = logging.getLogger(logger_name)
+    # Clear previous handlers to avoid duplicate logs on repeated imports/runs
+    if logger.hasHandlers():
+        logger.handlers.clear()
+
+    # Determine levels
+    console_level = logging.DEBUG if verbose else level
+    logger.setLevel(logging.DEBUG if verbose else level)
+
+    # Formatter without logger name (keeps output clean)
+    formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
+
+    # Console handler -> write to stdout by default
+    ch = logging.StreamHandler(sys.stdout)
+    ch.setLevel(console_level)
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
+    # Optional file handler
+    file_handler = None
+    if enable_file:
+        if not log_dir:
+            log_dir = os.getcwd()
+        os.makedirs(log_dir, exist_ok=True)
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Use short tool name derived from logger_name for the filename
+        safe_name = logger_name.split('.')[-1]
+        file_name = f"{safe_name}-{ts}.log"
+        fh = logging.FileHandler(os.path.join(log_dir, file_name))
+        fh.setLevel(logging.DEBUG) # file always capture debug for diagnostics
+        fh.setFormatter(formatter)
+        logger.addHandler(fh)
+        file_handler = fh
+        logger.debug("File logging enabled: %s", os.path.join(log_dir, file_name))
+
+    # Standard startup banner for all tools (printed once per logger instance)
+    # If banner hasn't been printed at all, log it normally (will go to console and file if present).
+    if not getattr(logger, "_welcome_printed", False):
+        logger.info("%s", WELCOME)
+        logger.info("%s", CITATION)
+        logger.info("%s", ISSUE)
+        setattr(logger, "_welcome_printed", True)
+        # Mark that banner also written to file if file handler exists
+        if file_handler:
+            # Also write formatted lines directly into the file to guarantee presence
+            try:
+                for msg in (WELCOME, CITATION, ISSUE):
+                    rec = logging.LogRecord(logger.name, logging.INFO, "", 0, msg, None, None)
+                    formatted = formatter.format(rec)
+                    # write to file handler's stream and flush
+                    try:
+                        file_handler.stream.write(formatted + "\n")
+                        file_handler.stream.flush()
+                    except Exception:
+                        # Best-effort; ignore write errors
+                        pass
+                setattr(logger, "_welcome_file_written", True)
+            except Exception:
+                pass
+    else:
+        # Banner already printed (likely to console by an early logger). Ensure it is written to file
+        # if file logging was just enabled and it hasn't yet been written to file.
+        if file_handler and not getattr(logger, "_welcome_file_written", False):
+            # Write banner lines directly into the file handler's stream (avoid duplicating console output).
+            try:
+                for msg in (WELCOME, CITATION, ISSUE):
+                    rec = logging.LogRecord(logger.name, logging.INFO, "", 0, msg, None, None)
+                    formatted = formatter.format(rec)
+                    try:
+                        file_handler.stream.write(formatted + "\n")
+                        file_handler.stream.flush()
+                    except Exception:
+                        pass
+                setattr(logger, "_welcome_file_written", True)
+            except Exception:
+                pass
+
+    return logger
+
+# ArgumentParser subclass that logs usage/help/errors via the logger
+class LoggingArgumentParser(argparse.ArgumentParser):
+    def __init__(self, *args, logger_name=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        # If logger_name provided, use that logger; otherwise use root logger
+        self._logger = logging.getLogger(logger_name) if logger_name else logging.getLogger()
+        # Emit the parser description immediately on creation so it appears for normal runs
+        # (tools create an early console-only logger before constructing the parser).
+        if getattr(self, 'description', None):
+            try:
+                self._logger.info("%s", str(self.description))
+            except Exception:
+                # If logging fails for any reason, swallow to avoid breaking parser creation.
+                pass
+
+    def print_usage(self, file=None):
+        # Preserve default usage printing to console; description already logged at init.
+        super().print_usage(file)
+
+    def print_help(self, file=None):
+        # Capture help output, strip description (already logged), and print the rest to console.
+        sio = StringIO()
+        super().print_help(sio)
+        help_text = sio.getvalue()
+        if self.description:
+            pattern = re.escape(str(self.description)) + r'(\r?\n){1,2}'
+            help_text = re.sub(pattern, '', help_text, count=1)
+        out_file = file if file is not None else sys.stdout
+        out_file.write(help_text)

+    def exit(self, status=0, message=None):
+        # Preserve argparse behaviour by writing any exit message to stderr and exiting.
+        if message:
+            sys.stderr.write(message)
+        raise SystemExit(status)
+
+    def error(self, message):
+        # Print usage to stderr (as argparse does) and log a concise error message via logger.
+        super().print_usage(sys.stderr)
+        prog = self.prog if hasattr(self, 'prog') else ''
+        self._logger.error("%s: error: %s", prog, message)
+        self.exit(2)
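Two behaviours of configure_logger are worth noting from the implementation above: repeated calls for the same logger name clear and rebuild handlers rather than stacking them, and the WELCOME/CITATION/ISSUE banner is emitted only once per logger object (via the _welcome_printed / _welcome_file_written flags), even when file logging is enabled by a later call. A short usage sketch (the log directory is illustrative):

    from PyamilySeq.constants import configure_logger

    # First call: console handler only; the banner is logged here, once.
    log = configure_logger("PyamilySeq.Demo", enable_file=False)
    log.info("console only")

    # Second call: handlers are cleared and rebuilt with a timestamped FileHandler;
    # the banner is not re-logged to the console but is written into the new logfile.
    log = configure_logger("PyamilySeq.Demo", enable_file=True, log_dir="logs", verbose=True)
    log.debug("with verbose=True this reaches both the console and the logfile")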
PyamilySeq/utils.py
CHANGED
@@ -7,6 +7,25 @@ from tempfile import NamedTemporaryFile
 import sys
 import re
 import math
+import logging
+
+logger = logging.getLogger("PyamilySeq")  # Use the shared top-level PyamilySeq logger so all utils logs propagate to the same handlers
+
+_startup_messages_pending = []
+
+
+def emit_pending_startup_messages():
+    global _startup_messages_pending
+    if _startup_messages_pending:
+        try:
+            for msg in _startup_messages_pending:
+                try:
+                    logger.info("%s", msg)
+                except Exception:
+                    # swallow any logging errors to avoid breaking flow
+                    pass
+        finally:
+            _startup_messages_pending.clear()

 ####
 # Placeholder for the distance function
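The _startup_messages_pending queue exists because utils.py is imported — and the Levenshtein fallback below is chosen — before any tool has called configure_logger, so a logger.info() at import time would be dropped for lack of handlers. The entry points (run_mafft_on_sequences, the read_* readers, write_groups_func, process_gene_groups) therefore call emit_pending_startup_messages() first, flushing any queued notices once handlers exist. A minimal sketch of the ordering:

    from PyamilySeq import utils
    from PyamilySeq.constants import configure_logger

    # Import time: the queue may already hold the Levenshtein notice, but nothing is printed.
    configure_logger("PyamilySeq")           # attach handlers to the shared "PyamilySeq" logger
    utils.emit_pending_startup_messages()    # queued notices now appear, then the queue is cleared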
@@ -18,7 +37,8 @@ try:
     def levenshtein_distance_calc(seq1, seq2):
         return LV.distance(seq1, seq2)
 except (ModuleNotFoundError, ImportError):
-
+    # Save the notice for later emission (after logger handlers are configured)
+    _startup_messages_pending.append("Levenshtein package not installed - Will fallback to slower Python implementation.")
     # Fallback implementation
     def levenshtein_distance_calc(seq1, seq2):
         # Slower Python implementation of Levenshtein distance
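The fallback body itself is unchanged and not shown in this hunk; for reference, a typical pure-Python dynamic-programming Levenshtein of the kind the comment describes (a sketch, not necessarily the package's exact implementation):

    def levenshtein_distance_calc(seq1, seq2):
        # Classic two-row dynamic programming; O(len(seq1) * len(seq2)) time.
        if len(seq1) < len(seq2):
            seq1, seq2 = seq2, seq1
        previous = list(range(len(seq2) + 1))
        for i, c1 in enumerate(seq1, start=1):
            current = [i]
            for j, c2 in enumerate(seq2, start=1):
                current.append(min(previous[j] + 1,                    # deletion
                                   current[j - 1] + 1,                 # insertion
                                   previous[j - 1] + (c1 != c2)))      # substitution
            previous = current
        return previous[-1]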
@@ -62,6 +82,11 @@ codon_table = {
     'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
     'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'}

+# Temp fix
+codon_table['TAA'] = ''
+codon_table['TGA'] = ''
+codon_table['TAG'] = ''
+
 def translate_frame(sequence):
     translate = ''.join([codon_table.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)])
     return translate
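The net effect of the "Temp fix" is that translate_frame now silently drops stop codons instead of emitting '*':

    from PyamilySeq.utils import translate_frame

    print(translate_frame('ATGTAAATG'))  # Met, stop (TAA), Met -> 'MM' with the override; 'M*M' without it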
@@ -94,10 +119,15 @@ def translate_dna_to_aa(dna_fasta, aa_fasta):


 def detect_sequence_type(fasta_file):
-
+    import gzip
+    opener = gzip.open if str(fasta_file).lower().endswith('.gz') else open
+    with opener(fasta_file, 'rt') as f:
         for line in f:
             if line.startswith('>'):
                 continue
+            line = line.strip().upper()
+            if not line:
+                continue
             if any(base in line for base in 'EFILPQZ'):
                 return False # Contains amino acids
     return True # Contains DNA
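detect_sequence_type now opens gzipped FASTA transparently (via the .gz suffix check), upper-cases each record line, and skips blanks before applying the heuristic: any of the amino-acid-only letters E, F, I, L, P, Q, Z marks the file as protein (returns False), otherwise it is treated as DNA (returns True). Illustrative calls (filenames hypothetical):

    from PyamilySeq.utils import detect_sequence_type

    detect_sequence_type('genes_dna.fasta')       # True  -> DNA
    detect_sequence_type('proteins_aa.fasta.gz')  # False -> amino acids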
@@ -105,7 +135,6 @@ def detect_sequence_type(fasta_file):

 def is_tool_installed(tool_name):
     """Check if a tool is installed and available in PATH."""
-    # Check if the tool is in the system PATH
     if shutil.which(tool_name) is None:
         return False

@@ -119,7 +148,9 @@ def is_tool_installed(tool_name):
     return False # This shouldn't happen due to the earlier check

 def reverse_complement(seq):
-    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'
+    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N','R': 'Y',
+                  'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K', 'V': 'B',
+                  'B': 'V', 'H': 'D', 'D': 'H'}
     return ''.join(complement[base] for base in reversed(seq))

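With the expanded table, IUPAC ambiguity codes are reverse-complemented to their correct partners instead of raising KeyError on anything outside ACGTN, e.g.:

    from PyamilySeq.utils import reverse_complement

    print(reverse_complement('ATGCRYSWKMBDHVN'))  # -> 'NBDHVKMWSRYGCAT'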
@@ -196,8 +227,8 @@ def select_longest_gene(sequences, subgrouped):


 def run_mafft_on_sequences(options, sequences, output_file):
-    #print("Conducting MAFFT alignment.")
     """Run mafft on the given sequences and write to output file."""
+    emit_pending_startup_messages()
     # Create a temporary input file for mafft
     with NamedTemporaryFile('w', delete=False) as temp_input_file:
         for header, sequence in sequences.items():
@@ -207,14 +238,13 @@ def run_mafft_on_sequences(options, sequences, output_file):
     # Run mafft
     try:
         with open(output_file, 'w') as output_f:
-            if options
+            if getattr(options, "verbose", False):
                 subprocess.run(
                     ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
                     stdout=output_f,
                     stderr=sys.stderr,
                     check=True
                 )
-
             else:
                 subprocess.run(
                     ['mafft', '--auto', '--thread', str(options.threads), temp_input_file_path],
@@ -223,22 +253,28 @@ def run_mafft_on_sequences(options, sequences, output_file):
                     check=True
                 )
     finally:
-
-
+        try:
+            os.remove(temp_input_file_path) # Clean up the temporary file
+        except Exception:
+            pass



 def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident, combined_out, translate, run_as_combiner):
+    emit_pending_startup_messages()
     if run_as_combiner == True:
-
+        combined_out_file_aa_path = None
     else:
-
+        combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')

-
+    # Open actual AA file or os.devnull based on whether we need an AA file path
+    aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
+    with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
         paired_files_found = None
         #with open(combined_out, 'w') as combined_out_file, open(combined_out.replace('_dna.fasta','_aa.fasta'), 'w') as combined_out_file_aa:
         gff_files = glob.glob(os.path.join(input_dir, '*' + name_split_gff))
         if not gff_files:
+            logger.error("Error: No GFF files found in %s (pattern: *%s).", input_dir, name_split_gff)
             sys.exit("Error: No GFF files found.")
         for gff_file in gff_files:
             genome_name = os.path.basename(gff_file).split(name_split_gff)[0]
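The aa_handle expression above is this release's recurring idiom for optional amino-acid output: when run_as_combiner is set there is no AA path, so writes are routed to os.devnull rather than guarding every write site with an if. A distilled sketch of the idiom (names hypothetical):

    import os

    def open_optional(path):
        # A real handle when a path is wanted, otherwise a sink that swallows writes.
        return open(path, 'w') if path else open(os.devnull, 'w')

    aa_path = None  # e.g. run_as_combiner == True
    with open('combined_dna.fasta', 'w') as dna_out, open_optional(aa_path) as aa_out:
        dna_out.write('>seq1\nATG\n')
        aa_out.write('>seq1\nM\n')  # silently discarded when aa_path is None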
@@ -251,12 +287,12 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
                     corresponding_fasta_file = temp_file
                     break
             if corresponding_fasta_file is None:
-
+                logger.warning("Corresponding FASTA file for GFF file '%s' not found. Skipping. Try using the -name_split_fasta option.", gff_file)
                 continue
         else:
             corresponding_fasta_file = os.path.join(input_dir, genome_name + name_split_fasta)
             if not os.path.exists(corresponding_fasta_file):
-
+                logger.warning("Corresponding FASTA file for GFF file '%s' not found: expected '%s'. Skipping. Try using the -name_split_fasta option.", gff_file, corresponding_fasta_file)
                 continue

         gff_features = []
@@ -322,25 +358,30 @@ def read_separate_files(input_dir, name_split_gff, name_split_fasta, gene_ident,
                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

     if not paired_files_found:
+        logger.error("Could not find matching GFF/FASTA files. Please check input directory and -name_split_gff / -name_split_fasta parameters.")
         sys.exit("Could not find matching GFF/FASTA files - Please check input directory and -name_split_gff and -name_split_fasta parameters.")
     if translate == False or translate == None:
-        #Clean up unused file
-
-
-        os.
-
-
+        # Clean up unused file only if it was a real file we created (never remove os.devnull)
+        if combined_out_file_aa_path:
+            try:
+                if os.path.exists(combined_out_file_aa_path):
+                    os.remove(combined_out_file_aa_path)
+            except Exception:
+                pass


 def read_combined_files(input_dir, name_split, gene_ident, combined_out, translate, run_as_combiner):
+    emit_pending_startup_messages()
     if run_as_combiner == True:
-
+        combined_out_file_aa_path = None
     else:
-
-
-
+        combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
+
+    aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
+    with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
         gff_files = glob.glob(os.path.join(input_dir, '*' + name_split))
         if not gff_files:
+            logger.error("Error: No GFF files found in %s (pattern: *%s).", input_dir, name_split)
             sys.exit("Error: No GFF files found - check input directory and -name_split_gff parameter.")
         for gff_file in gff_files:
             genome_name = os.path.basename(gff_file).split(name_split)[0]
@@ -409,24 +450,29 @@ def read_combined_files(input_dir, name_split, gene_ident, combined_out, transla
                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

     if translate == False or translate == None:
-        #Clean up unused file
-
-
-        os.
-
-
+        # Clean up unused file only if it was a real file we created (never remove os.devnull)
+        if combined_out_file_aa_path:
+            try:
+                if os.path.exists(combined_out_file_aa_path):
+                    os.remove(combined_out_file_aa_path)
+            except Exception:
+                pass



 def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_as_combiner):
+    emit_pending_startup_messages()
     if run_as_combiner == True:
-
+        combined_out_file_aa_path = None
     else:
-
-
+        combined_out_file_aa_path = combined_out.replace('_dna.fasta','_aa.fasta')
+
+    aa_handle = open(combined_out_file_aa_path, 'w') if combined_out_file_aa_path else open(os.devnull, 'w')
+    with open(combined_out, 'w') as combined_out_file, aa_handle as combined_out_file_aa:
         fasta_files = glob.glob(os.path.join(input_dir, '*' + name_split_fasta))
         if not fasta_files:
-
+            logger.error("Error: No FASTA files found in %s (pattern: *%s).", input_dir, name_split_fasta)
+            sys.exit("Error: No FASTA files found.")
         for fasta_file in fasta_files:
             genome_name = os.path.basename(fasta_file).split(name_split_fasta)[0]
             fasta_dict = collections.defaultdict(str)
@@ -456,31 +502,63 @@ def read_fasta_files(input_dir, name_split_fasta, combined_out, translate, run_a
                 combined_out_file.write(f">{genome_name}|{seq_id}\n{wrapped_sequence}\n")

     if translate == False or translate == None:
-        #Clean up unused file
-
-
-        os.
-
-
+        # Clean up unused file only if it was a real file we created (never remove os.devnull)
+        if combined_out_file_aa_path:
+            try:
+                if os.path.exists(combined_out_file_aa_path):
+                    os.remove(combined_out_file_aa_path)
+            except Exception:
+                pass
+
+def write_individual_groups(options, output_dir, key_order, cores, sequences,
+                            pangenome_clusters_First_sequences_sorted,
+                            combined_pangenome_clusters_Second_sequences):
+    if not getattr(options, "write_individual_groups", False):
+        return
+
+    os.makedirs(output_dir, exist_ok=True)
+
+
+    for key_prefix in key_order:
+        for key, values in cores.items():
+            if not key.startswith(key_prefix):
+                continue

+            for value in values:
+                sequences_to_write = (pangenome_clusters_First_sequences_sorted[value]
+                                      if 'First' in key_prefix
+                                      else combined_pangenome_clusters_Second_sequences[value])
+
+                dna_path = os.path.join(output_dir, f"{key}_{value}_dna.fasta")
+                aa_path = dna_path.replace('_dna.fasta', '_aa.fasta')
+
+                if getattr(options, "sequence_type", None) == 'AA':
+                    with open(dna_path, 'w') as dna_f, open(aa_path, 'w') as aa_f:
+                        for header in sequences_to_write:
+                            if header not in sequences:
+                                if getattr(options, "verbose", False):
+                                    print(f"Sequence {header} not found in original_fasta file.")
+                                continue
+                            seq = sequences[header]
+                            dna_f.write(f">{header}\n{wrap_sequence(seq)}\n")
+                            aa_f.write(f">{header}\n{wrap_sequence(translate_frame(seq))}\n")
+                else:
+                    with open(dna_path, 'w') as dna_f:
+                        for header in sequences_to_write:
+                            if header not in sequences:
+                                if getattr(options, "verbose", False):
+                                    print(f"Sequence {header} not found in original_fasta file.")
+                                continue
+                            seq = sequences[header]
+                            dna_f.write(f">{header}\n{wrap_sequence(seq)}\n")

 def write_groups_func(options, output_dir, key_order, cores, sequences,
                       pangenome_clusters_First_sequences_sorted, combined_pangenome_clusters_Second_sequences):
-
-
-
-    Parameters:
-    - options: Command-line options.
-    - output_dir: Directory where output FASTA files will be saved.
-    - key_order: The order in which to process keys.
-    - cores: Dictionary of core genes.
-    - sequences: Dictionary mapping headers to sequences.
-    - pangenome_clusters_First_sequences_sorted: Dictionary of first sequence clusters.
-    - combined_pangenome_clusters_Second_sequences: Dictionary of second sequence clusters.
-    """
+
+    emit_pending_startup_messages()
     # Create output directory if it doesn't exist
     if not os.path.exists(output_dir):
-
+        os.makedirs(output_dir)

     for group in options.write_groups.split(','):

@@ -514,7 +592,10 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
                         outfile_aa.write(f">{header}\n")
                         outfile_aa.write(f"{wrapped_sequence_aa}\n")
                     else:
-
+                        try:
+                            os.remove(outfile_aa.name) # Delete individual file if option is disabled
+                        except FileNotFoundError:
+                            pass
                     # Always write to the combined AA file
                     combined_fasta_aa.write(f">Group_{value}|{header}\n")
                     combined_fasta_aa.write(f"{wrapped_sequence_aa}\n")
@@ -530,18 +611,24 @@ def write_groups_func(options, output_dir, key_order, cores, sequences,
                         outfile.write(f">{header}\n")
                         outfile.write(f"{wrapped_sequence}\n")
                     else:
-
+                        try:
+                            os.remove(outfile.name) # Delete individual file if option is disabled
+                        except FileNotFoundError:
+                            pass
                     # Always write to the combined nucleotide file
                     combined_fasta.write(f">Group_{value}|{header}\n")
                     combined_fasta.write(f"{wrapped_sequence}\n")

             else:
                 if options.verbose == True:
-
+                    logger.info("Sequence " + header + " not found in original_fasta file.")
     if options.sequence_type != 'AA':
         #Clean up unused file
-
-
+        try:
+            os.remove(combined_fasta_aa.name)
+        except FileNotFoundError:
+            pass
+    logger.info("Combined FASTA file saved to: " + combined_fasta_filename)


 # def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, output_file):
@@ -612,38 +699,38 @@ def perform_alignment(gene_path,group_directory, gene_file, options, concatenate
     return concatenated_sequences

 def process_gene_groups(options, group_directory, sub_group_directory, paralog_groups, genome_list, output_file):
+    emit_pending_startup_messages()
     """Process each gene family file to select the longest sequence per genome and concatenate aligned sequences."""
     concatenated_sequences = {genome: "" for genome in genome_list}
     output_file = group_directory.replace('Gene_Groups_Output', output_file)
     if paralog_groups != None:
-
+        threshold_size = math.floor(int(options.align_core) * int(options.genome_num) / 100)

     if options.align_aa == True:
-
+        affix = '_aa.fasta'
     else:
-
+        affix = '_dna.fasta'

     if options.align_core == True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Iterate over each gene family file
+        for gene_file in os.listdir(group_directory):
+            if gene_file.endswith(affix) and not gene_file.startswith('combined_group_sequences'):
+                current_group = int(gene_file.split('_')[3].split('.')[0])
+                gene_path = os.path.join(group_directory, gene_file)
+                # Could add more catches here to work with First and Secondary groups - This ensures only core '99/100' are aligned
+                if 'First_core_99' in gene_file or 'First_core_100' in gene_file:
+                    # Check for matching group in paralog_groups
+                    if sub_group_directory and paralog_groups and '>Group_'+str(current_group) in paralog_groups:
+                        for subgroup, size in enumerate(paralog_groups['>Group_' + str(current_group)]['sizes']):
+                            if size >= threshold_size:
+                                gene_path = os.path.join(sub_group_directory,f"Group_{current_group}_subgroup_{subgroup}{affix}")
+                                concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, True)
+                    else:
+                        concatenated_sequences = perform_alignment(gene_path, group_directory, gene_file, options, concatenated_sequences, False)
+
+    # Write the concatenated sequences to the output file
+    with open(output_file, 'w') as out:
+        for genome, sequence in concatenated_sequences.items():
+            out.write(f">{genome}\n")
+            wrapped_sequence = wrap_sequence(sequence, 60)
+            out.write(f"{wrapped_sequence}\n")