moducomp 0.7.7__py3-none-any.whl → 0.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moducomp/__init__.py +1 -1
- moducomp/moducomp.py +432 -269
- {moducomp-0.7.7.dist-info → moducomp-0.7.9.dist-info}/METADATA +5 -3
- {moducomp-0.7.7.dist-info → moducomp-0.7.9.dist-info}/RECORD +7 -7
- {moducomp-0.7.7.dist-info → moducomp-0.7.9.dist-info}/WHEEL +0 -0
- {moducomp-0.7.7.dist-info → moducomp-0.7.9.dist-info}/entry_points.txt +0 -0
- {moducomp-0.7.7.dist-info → moducomp-0.7.9.dist-info}/licenses/LICENSE.txt +0 -0
moducomp/moducomp.py
CHANGED
|
@@ -44,6 +44,57 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
|
44
44
|
import pandas as pd
|
|
45
45
|
import typer
|
|
46
46
|
|
|
47
|
+
RESOURCE_SUMMARIES: List[Dict[str, Any]] = []
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _get_logger(name: str = "ModuComp") -> logging.Logger:
    """
    Return the logger used for package-wide messages.

    Parameters
    ----------
    name : str, optional
        Name of the logger to look up. Defaults to ``"ModuComp"``, so a
        call without arguments behaves exactly as before.

    Returns
    -------
    logging.Logger
        The logger registered under ``name``. ``logging.getLogger`` hands
        back the same object for repeated calls with the same name.
    """
    return logging.getLogger(name)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _log_lines(
    logger: logging.Logger,
    message: Union[str, List[str], Tuple[str, ...]],
    level: int = logging.INFO,
) -> None:
    """
    Log a message as one record per line, dropping blank lines.

    Parameters
    ----------
    logger : logging.Logger
        Logger that receives the records.
    message : str or list/tuple of str
        A single value is converted with ``str`` and split on line
        boundaries. A list or tuple is treated as already split: each item
        becomes one record and is not split any further.
    level : int, optional
        Logging level applied to every record (default ``logging.INFO``).
    """
    if isinstance(message, (list, tuple)):
        lines = message
    else:
        lines = str(message).splitlines()

    for line in lines:
        # Items of a caller-supplied sequence are not guaranteed to be
        # strings; coerce so a stray int/None cannot raise on ``.strip()``.
        text = line if isinstance(line, str) else str(line)
        if text.strip():
            logger.log(level, text)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _log_or_print(
    message: str,
    logger: Optional[logging.Logger] = None,
    level: int = logging.INFO,
    color: str = "white",
    err: bool = False,
    verbose: bool = True,
    force: bool = False,
) -> None:
    """
    Route a message to the logger when it has handlers, else to the console.

    Parameters
    ----------
    message : str
        Text to emit.
    logger : logging.Logger, optional
        Destination logger; the package logger is used when omitted.
    level : int, optional
        Logging level for the message.
    color : str, optional
        Foreground colour for the console fallback.
    err : bool, optional
        Passed to ``typer.secho`` as ``err`` for the console fallback.
    verbose : bool, optional
        When False, messages below WARNING are demoted (see below).
    force : bool, optional
        When True, the message is never demoted.

    Notes
    -----
    A message is demoted only if it is below WARNING, ``verbose`` is False
    and ``force`` is False. A demoted message is logged at DEBUG when the
    logger has handlers and is otherwise dropped.
    """
    target = logger or _get_logger()
    configured = bool(getattr(target, "handlers", []))
    demoted = not (level >= logging.WARNING or verbose or force)

    if demoted:
        if configured:
            _log_lines(target, message, logging.DEBUG)
        return

    if not configured:
        # No handlers attached: fall back to coloured console output.
        typer.secho(message, fg=color, err=err)
        return

    _log_lines(target, message, level)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def log_info(message: str, logger: Optional[logging.Logger] = None) -> None:
    """Emit ``message`` at INFO level with ``force=True`` (never demoted)."""
    _log_or_print(
        message,
        logger=logger,
        level=logging.INFO,
        color="white",
        err=False,
        force=True,
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def log_warning(message: str, logger: Optional[logging.Logger] = None) -> None:
    """Emit ``message`` at WARNING level; the console fallback is yellow with ``err=True``."""
    _log_or_print(
        message,
        logger=logger,
        level=logging.WARNING,
        color="yellow",
        err=True,
        force=True,
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def log_error(message: str, logger: Optional[logging.Logger] = None) -> None:
    """Emit ``message`` at ERROR level; the console fallback is red with ``err=True``."""
    _log_or_print(
        message,
        logger=logger,
        level=logging.ERROR,
        color="red",
        err=True,
        force=True,
    )
|
|
97
|
+
|
|
47
98
|
def _data_roots() -> List[Path]:
|
|
48
99
|
roots: List[Path] = []
|
|
49
100
|
env_root = os.environ.get("MODUCOMP_DATA_DIR")
|
|
@@ -123,31 +174,22 @@ def require_eggnog_data_dir(eggnog_data_dir: Optional[str], logger: Optional[log
|
|
|
123
174
|
return data_dir
|
|
124
175
|
def conditional_output(message: str, color: str = "white", verbose: bool = True) -> None:
    """
    Report a progress message, gated by the caller's verbosity.

    Parameters
    ----------
    message : str
        Text to emit
    color : str, optional
        Colour applied only on the console fallback path
    verbose : bool, optional
        True emits the message at INFO level; False demotes it to DEBUG
    """
    _log_or_print(
        message,
        verbose=verbose,
        force=False,
        level=logging.INFO,
        color=color,
        err=False,
    )
|
|
144
189
|
|
|
145
190
|
def emit_error(message: str, logger: Optional[logging.Logger] = None) -> None:
    """Report an error by delegating to ``log_error``."""
    log_error(message=message, logger=logger)
|
|
151
193
|
|
|
152
194
|
|
|
153
195
|
def format_bytes(num_bytes: float) -> str:
|
|
@@ -222,15 +264,20 @@ def run_subprocess_with_logging(
|
|
|
222
264
|
logger.debug("Working directory: %s", os.getcwd())
|
|
223
265
|
|
|
224
266
|
try:
|
|
267
|
+
output_level = logging.INFO if verbose else logging.DEBUG
|
|
268
|
+
error_level = logging.WARNING if verbose else logging.DEBUG
|
|
269
|
+
|
|
225
270
|
def stream_reader(stream, q, stream_type):
|
|
226
|
-
"""Read from stream and put lines in queue"""
|
|
271
|
+
"""Read from stream and put lines in queue."""
|
|
227
272
|
try:
|
|
228
273
|
while True:
|
|
229
274
|
line = stream.readline()
|
|
230
275
|
if not line:
|
|
231
276
|
break
|
|
232
|
-
line = line.rstrip(
|
|
233
|
-
|
|
277
|
+
line = line.rstrip("\n\r")
|
|
278
|
+
if not line:
|
|
279
|
+
continue
|
|
280
|
+
q.put((stream_type, line))
|
|
234
281
|
stream.close()
|
|
235
282
|
except Exception:
|
|
236
283
|
pass
|
|
@@ -282,13 +329,10 @@ def run_subprocess_with_logging(
|
|
|
282
329
|
stream_type, line = stdout_queue.get_nowait()
|
|
283
330
|
if stream_type == 'stdout':
|
|
284
331
|
stdout_lines.append(line)
|
|
285
|
-
# Stream to console immediately
|
|
286
|
-
if verbose:
|
|
287
|
-
print(line, flush=True)
|
|
288
332
|
if logger:
|
|
289
|
-
logger
|
|
290
|
-
|
|
291
|
-
|
|
333
|
+
_log_lines(logger, line, output_level)
|
|
334
|
+
elif verbose:
|
|
335
|
+
print(line, flush=True)
|
|
292
336
|
last_output_time = current_time
|
|
293
337
|
output_received = True
|
|
294
338
|
except queue.Empty:
|
|
@@ -299,13 +343,10 @@ def run_subprocess_with_logging(
|
|
|
299
343
|
stream_type, line = stderr_queue.get_nowait()
|
|
300
344
|
if stream_type == 'stderr':
|
|
301
345
|
stderr_lines.append(line)
|
|
302
|
-
# Stream to console immediately
|
|
303
|
-
if verbose:
|
|
304
|
-
print(line, file=sys.stderr, flush=True)
|
|
305
346
|
if logger:
|
|
306
|
-
logger
|
|
307
|
-
|
|
308
|
-
|
|
347
|
+
_log_lines(logger, line, error_level)
|
|
348
|
+
elif verbose:
|
|
349
|
+
print(line, file=sys.stderr, flush=True)
|
|
309
350
|
last_output_time = current_time
|
|
310
351
|
output_received = True
|
|
311
352
|
except queue.Empty:
|
|
@@ -314,10 +355,11 @@ def run_subprocess_with_logging(
|
|
|
314
355
|
# Show progress message if no output for a while
|
|
315
356
|
if not output_received and current_time - last_output_time > progress_interval:
|
|
316
357
|
elapsed = int(current_time - last_output_time)
|
|
317
|
-
|
|
318
|
-
print(f" ... still running (no output for {elapsed}s)", flush=True)
|
|
358
|
+
message = f"Process still running, no output for {elapsed} seconds"
|
|
319
359
|
if logger:
|
|
320
|
-
logger
|
|
360
|
+
_log_lines(logger, message, output_level)
|
|
361
|
+
elif verbose:
|
|
362
|
+
print(message, flush=True)
|
|
321
363
|
last_output_time = current_time
|
|
322
364
|
|
|
323
365
|
# Small delay to prevent busy waiting
|
|
@@ -333,10 +375,10 @@ def run_subprocess_with_logging(
|
|
|
333
375
|
stream_type, line = stdout_queue.get_nowait()
|
|
334
376
|
if stream_type == 'stdout':
|
|
335
377
|
stdout_lines.append(line)
|
|
336
|
-
if verbose:
|
|
337
|
-
print(line, flush=True)
|
|
338
378
|
if logger:
|
|
339
|
-
logger
|
|
379
|
+
_log_lines(logger, line, output_level)
|
|
380
|
+
elif verbose:
|
|
381
|
+
print(line, flush=True)
|
|
340
382
|
except queue.Empty:
|
|
341
383
|
break
|
|
342
384
|
|
|
@@ -345,10 +387,10 @@ def run_subprocess_with_logging(
|
|
|
345
387
|
stream_type, line = stderr_queue.get_nowait()
|
|
346
388
|
if stream_type == 'stderr':
|
|
347
389
|
stderr_lines.append(line)
|
|
348
|
-
if verbose:
|
|
349
|
-
print(line, file=sys.stderr, flush=True)
|
|
350
390
|
if logger:
|
|
351
|
-
logger
|
|
391
|
+
_log_lines(logger, line, error_level)
|
|
392
|
+
elif verbose:
|
|
393
|
+
print(line, file=sys.stderr, flush=True)
|
|
352
394
|
except queue.Empty:
|
|
353
395
|
break
|
|
354
396
|
|
|
@@ -363,9 +405,7 @@ def run_subprocess_with_logging(
|
|
|
363
405
|
|
|
364
406
|
except Exception as e:
|
|
365
407
|
error_msg = f"Exception running command {' '.join(cmd)}: {str(e)}"
|
|
366
|
-
|
|
367
|
-
logger.error(error_msg)
|
|
368
|
-
print(f"ERROR: {error_msg}", file=sys.stderr)
|
|
408
|
+
log_error(error_msg, logger=logger)
|
|
369
409
|
return -1, "", str(e)
|
|
370
410
|
|
|
371
411
|
|
|
@@ -494,23 +534,33 @@ def run_subprocess_with_resource_monitoring(
|
|
|
494
534
|
f.write(f"# {key}: {value}\n")
|
|
495
535
|
f.write("\n")
|
|
496
536
|
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
537
|
+
RESOURCE_SUMMARIES.append(
|
|
538
|
+
{
|
|
539
|
+
"description": description,
|
|
540
|
+
"command": cmd_str,
|
|
541
|
+
"elapsed_seconds": elapsed_seconds,
|
|
542
|
+
"user_time": user_time,
|
|
543
|
+
"system_time": system_time,
|
|
544
|
+
"cpu_percent": cpu_percent,
|
|
545
|
+
"max_ram_gb": max_ram_gb_str,
|
|
546
|
+
"exit_status": exit_status,
|
|
547
|
+
}
|
|
548
|
+
)
|
|
506
549
|
|
|
507
550
|
if logger:
|
|
508
|
-
logger.
|
|
551
|
+
logger.debug(
|
|
552
|
+
"Resource usage recorded for %s (wall=%ss, cpu=%s, peak_ram=%s GB).",
|
|
553
|
+
description,
|
|
554
|
+
elapsed_seconds,
|
|
555
|
+
cpu_percent,
|
|
556
|
+
max_ram_gb_str,
|
|
557
|
+
)
|
|
509
558
|
|
|
510
559
|
except Exception as e:
|
|
511
560
|
if logger:
|
|
512
|
-
logger.warning(
|
|
513
|
-
|
|
561
|
+
logger.warning("Failed to parse resource usage: %s", str(e))
|
|
562
|
+
else:
|
|
563
|
+
log_warning(f"Failed to parse resource usage: {str(e)}")
|
|
514
564
|
|
|
515
565
|
# Clean up temporary file
|
|
516
566
|
try:
|
|
@@ -521,7 +571,8 @@ def run_subprocess_with_resource_monitoring(
|
|
|
521
571
|
else:
|
|
522
572
|
if logger:
|
|
523
573
|
logger.warning("Resource monitoring file not found")
|
|
524
|
-
|
|
574
|
+
else:
|
|
575
|
+
log_warning("Resource monitoring output not found")
|
|
525
576
|
|
|
526
577
|
return returncode, stdout, stderr
|
|
527
578
|
|
|
@@ -549,14 +600,53 @@ def log_final_resource_summary(resource_log_file: str, total_start_time: float,
|
|
|
549
600
|
f.write(f"Pipeline completed at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
|
550
601
|
f.write(f"Total pipeline elapsed time: {total_elapsed:.2f} seconds ({total_elapsed/60:.2f} minutes)\n")
|
|
551
602
|
|
|
552
|
-
if verbose:
|
|
553
|
-
conditional_output("Resource usage summary saved.", "green", verbose)
|
|
554
|
-
conditional_output(f"Resource log: {resource_log_file}", "white", verbose)
|
|
555
|
-
conditional_output(f"Total pipeline time: {total_elapsed:.2f}s ({total_elapsed/60:.2f}min)", "white", verbose)
|
|
556
|
-
|
|
557
603
|
if logger:
|
|
558
|
-
|
|
559
|
-
|
|
604
|
+
_log_lines(
|
|
605
|
+
logger,
|
|
606
|
+
[
|
|
607
|
+
"Resource usage summary completed.",
|
|
608
|
+
f"Resource log: {resource_log_file}",
|
|
609
|
+
f"Total pipeline time: {total_elapsed:.2f}s ({total_elapsed/60:.2f}min)",
|
|
610
|
+
],
|
|
611
|
+
logging.INFO,
|
|
612
|
+
)
|
|
613
|
+
log_resource_usage_summary(logger)
|
|
614
|
+
else:
|
|
615
|
+
_log_or_print(
|
|
616
|
+
f"Resource log: {resource_log_file}",
|
|
617
|
+
level=logging.INFO,
|
|
618
|
+
verbose=verbose,
|
|
619
|
+
force=True,
|
|
620
|
+
)
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def log_resource_usage_summary(logger: Optional[logging.Logger] = None) -> None:
    """
    Log one INFO line per entry recorded in ``RESOURCE_SUMMARIES``.

    Nothing is logged when no entries have been recorded or when the
    logger (the package logger if none is given) has no handlers.

    Parameters
    ----------
    logger : logging.Logger, optional
        Logger that receives the summary lines.
    """
    if not RESOURCE_SUMMARIES:
        return
    active = logger or _get_logger()
    if not getattr(active, "handlers", []):
        return

    placeholders = ("N/A", None, "")

    def _with_unit(value: Any, unit: str) -> str:
        # Missing measurements are shown as a bare "N/A", without a unit.
        return "N/A" if value in placeholders else f"{value}{unit}"

    _log_lines(active, "Resource usage summary (per command):", logging.INFO)
    for record in RESOURCE_SUMMARIES:
        label = record.get("description", "Command")
        fields = [
            "wall=" + _with_unit(record.get("elapsed_seconds", "N/A"), "s"),
            "user=" + _with_unit(record.get("user_time", "N/A"), "s"),
            "system=" + _with_unit(record.get("system_time", "N/A"), "s"),
            f"cpu={record.get('cpu_percent', 'N/A')}",
            "peak_ram=" + _with_unit(record.get("max_ram_gb", "N/A"), " GB"),
            f"exit={record.get('exit_status', 'N/A')}",
        ]
        _log_lines(active, f" - {label}: " + ", ".join(fields), logging.INFO)
|
|
560
650
|
|
|
561
651
|
|
|
562
652
|
def display_pipeline_completion_summary(start_time: float, savedir: str, logger: Optional[logging.Logger] = None, verbose: bool = True) -> None:
|
|
@@ -605,19 +695,19 @@ def display_pipeline_completion_summary(start_time: float, savedir: str, logger:
|
|
|
605
695
|
if complementarity_files > 0:
|
|
606
696
|
output_files.append(f"{complementarity_files} complementarity report(s)")
|
|
607
697
|
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
698
|
+
summary_lines = [
|
|
699
|
+
"Pipeline completed.",
|
|
700
|
+
f"Total execution time: {time_str} ({total_elapsed:.2f} seconds)",
|
|
701
|
+
f"Output directory: {savedir}",
|
|
702
|
+
f"Generated files: {', '.join(output_files) if output_files else 'None'}",
|
|
703
|
+
f"Completed at: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
|
|
704
|
+
]
|
|
614
705
|
|
|
615
706
|
if logger:
|
|
616
|
-
logger
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
logger.info(f"Completed at: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
707
|
+
_log_lines(logger, summary_lines, logging.INFO)
|
|
708
|
+
else:
|
|
709
|
+
for line in summary_lines:
|
|
710
|
+
_log_or_print(line, level=logging.INFO, verbose=verbose, force=True)
|
|
621
711
|
|
|
622
712
|
|
|
623
713
|
def parse_emapper_annotations(emapper_file_path: str, logger: Optional[logging.Logger] = None) -> Dict[str, Dict[str, List[str]]]:
|
|
@@ -836,8 +926,9 @@ def configure_logging(log_level: str, log_dir: Union[str, Path]) -> logging.Logg
|
|
|
836
926
|
|
|
837
927
|
logger = logging.getLogger("ModuComp")
|
|
838
928
|
numeric_level = getattr(logging, log_level.upper(), logging.INFO)
|
|
839
|
-
logger.setLevel(
|
|
929
|
+
logger.setLevel(logging.DEBUG)
|
|
840
930
|
logger.handlers.clear()
|
|
931
|
+
logger.propagate = False
|
|
841
932
|
|
|
842
933
|
formatter = logging.Formatter(
|
|
843
934
|
fmt="%(asctime)s %(levelname)s %(name)s: %(message)s",
|
|
@@ -848,12 +939,22 @@ def configure_logging(log_level: str, log_dir: Union[str, Path]) -> logging.Logg
|
|
|
848
939
|
file_handler.setLevel(logging.DEBUG)
|
|
849
940
|
file_handler.setFormatter(formatter)
|
|
850
941
|
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
942
|
+
class _BelowWarningFilter(logging.Filter):
|
|
943
|
+
def filter(self, record: logging.LogRecord) -> bool:
|
|
944
|
+
return record.levelno < logging.WARNING
|
|
945
|
+
|
|
946
|
+
stdout_handler = logging.StreamHandler(stream=sys.stdout)
|
|
947
|
+
stdout_handler.setLevel(numeric_level)
|
|
948
|
+
stdout_handler.setFormatter(formatter)
|
|
949
|
+
stdout_handler.addFilter(_BelowWarningFilter())
|
|
950
|
+
|
|
951
|
+
stderr_handler = logging.StreamHandler(stream=sys.stderr)
|
|
952
|
+
stderr_handler.setLevel(logging.WARNING)
|
|
953
|
+
stderr_handler.setFormatter(formatter)
|
|
854
954
|
|
|
855
955
|
logger.addHandler(file_handler)
|
|
856
|
-
logger.addHandler(
|
|
956
|
+
logger.addHandler(stdout_handler)
|
|
957
|
+
logger.addHandler(stderr_handler)
|
|
857
958
|
logger.debug("Logging initialised at level %s", logging.getLevelName(numeric_level))
|
|
858
959
|
logger.info(f"Log file created at: {log_file}")
|
|
859
960
|
return logger
|
|
@@ -903,10 +1004,9 @@ def how_many_genomes(genomedir: str, verbose: bool = True):
|
|
|
903
1004
|
"""
|
|
904
1005
|
n_files = len(get_path_to_each_genome(genomedir))
|
|
905
1006
|
if n_files > 0:
|
|
906
|
-
conditional_output(f"OK: {n_files} faa files were found in '{genomedir}'
|
|
1007
|
+
conditional_output(f"OK: {n_files} faa files were found in '{genomedir}'", "white", verbose)
|
|
907
1008
|
else:
|
|
908
|
-
|
|
909
|
-
typer.secho(f"ERROR: No FAA files were found in '{genomedir}'\n", fg="red")
|
|
1009
|
+
log_error(f"No FAA files were found in '{genomedir}'")
|
|
910
1010
|
exit()
|
|
911
1011
|
|
|
912
1012
|
|
|
@@ -921,12 +1021,12 @@ def create_output_dir(savedir: str, verbose: bool = True):
|
|
|
921
1021
|
verbose : bool
|
|
922
1022
|
Whether to display detailed output
|
|
923
1023
|
"""
|
|
924
|
-
conditional_output("
|
|
1024
|
+
conditional_output("Creating output directory", "green", verbose)
|
|
925
1025
|
if os.path.exists(savedir):
|
|
926
|
-
conditional_output(f"OK: Output directory already exists at: {savedir}
|
|
1026
|
+
conditional_output(f"OK: Output directory already exists at: {savedir}", "white", verbose)
|
|
927
1027
|
else:
|
|
928
1028
|
os.makedirs(savedir, exist_ok=True)
|
|
929
|
-
conditional_output(f"OK: Output directory created at: {savedir}
|
|
1029
|
+
conditional_output(f"OK: Output directory created at: {savedir}", "white", verbose)
|
|
930
1030
|
|
|
931
1031
|
|
|
932
1032
|
def get_tmp_dir(savedir:str) -> str:
|
|
@@ -958,13 +1058,13 @@ def create_tmp_dir(savedir: str, verbose: bool = True):
|
|
|
958
1058
|
verbose : bool
|
|
959
1059
|
Whether to display detailed output
|
|
960
1060
|
"""
|
|
961
|
-
conditional_output("
|
|
1061
|
+
conditional_output("Creating tmp dir", "green", verbose)
|
|
962
1062
|
tmp_dir_path = get_tmp_dir(savedir)
|
|
963
1063
|
if (os.path.exists(tmp_dir_path)):
|
|
964
|
-
conditional_output(f"OK: Tmp directory already exists at: {tmp_dir_path}
|
|
1064
|
+
conditional_output(f"OK: Tmp directory already exists at: {tmp_dir_path}", "white", verbose)
|
|
965
1065
|
else:
|
|
966
1066
|
os.mkdir(tmp_dir_path)
|
|
967
|
-
conditional_output(f"OK: Tmp directory created at: {tmp_dir_path}
|
|
1067
|
+
conditional_output(f"OK: Tmp directory created at: {tmp_dir_path}", "white", verbose)
|
|
968
1068
|
|
|
969
1069
|
|
|
970
1070
|
def adapt_fasta_headers(genomedir: str, savedir: str, verbose: bool = True) -> None:
|
|
@@ -985,11 +1085,11 @@ def adapt_fasta_headers(genomedir: str, savedir: str, verbose: bool = True) -> N
|
|
|
985
1085
|
verbose : bool
|
|
986
1086
|
Whether to display detailed output
|
|
987
1087
|
"""
|
|
988
|
-
conditional_output("
|
|
1088
|
+
conditional_output("Modifying fasta headers", "green", verbose)
|
|
989
1089
|
path_to_each_genome = get_path_to_each_genome(genomedir)
|
|
990
1090
|
output_dir = f"{get_tmp_dir(savedir)}/faa"
|
|
991
1091
|
if os.path.exists(output_dir):
|
|
992
|
-
conditional_output(f"OK: Fasta headers already modified at: {output_dir}
|
|
1092
|
+
conditional_output(f"OK: Fasta headers already modified at: {output_dir}", "white", verbose)
|
|
993
1093
|
return
|
|
994
1094
|
|
|
995
1095
|
os.mkdir(output_dir)
|
|
@@ -1005,7 +1105,7 @@ def adapt_fasta_headers(genomedir: str, savedir: str, verbose: bool = True) -> N
|
|
|
1005
1105
|
i+=1
|
|
1006
1106
|
else:
|
|
1007
1107
|
outfile.write(line)
|
|
1008
|
-
conditional_output(f"OK: Fasta headers modified at: {output_dir}
|
|
1108
|
+
conditional_output(f"OK: Fasta headers modified at: {output_dir}", "white", verbose)
|
|
1009
1109
|
|
|
1010
1110
|
|
|
1011
1111
|
def copy_faa_to_tmp(genomedir: str, savedir: str, verbose: bool = True) -> None:
|
|
@@ -1025,18 +1125,18 @@ def copy_faa_to_tmp(genomedir: str, savedir: str, verbose: bool = True) -> None:
|
|
|
1025
1125
|
verbose : bool
|
|
1026
1126
|
Whether to display detailed output
|
|
1027
1127
|
"""
|
|
1028
|
-
conditional_output("
|
|
1128
|
+
conditional_output("Copying faa files to tmp dir", "green", verbose)
|
|
1029
1129
|
path_to_each_genome = get_path_to_each_genome(genomedir)
|
|
1030
1130
|
output_dir = f"{get_tmp_dir(savedir)}/faa"
|
|
1031
1131
|
if os.path.exists(output_dir):
|
|
1032
|
-
conditional_output(f"OK: Fasta files already exist at: {output_dir}
|
|
1132
|
+
conditional_output(f"OK: Fasta files already exist at: {output_dir}", "white", verbose)
|
|
1033
1133
|
return
|
|
1034
1134
|
|
|
1035
1135
|
os.mkdir(output_dir)
|
|
1036
1136
|
conditional_output("Copying genome files to temporary directory...", "yellow", verbose)
|
|
1037
1137
|
for each_file in path_to_each_genome:
|
|
1038
1138
|
shutil.copy(each_file, output_dir)
|
|
1039
|
-
conditional_output(f"OK: Fasta files copied to: {output_dir}
|
|
1139
|
+
conditional_output(f"OK: Fasta files copied to: {output_dir}", "white", verbose)
|
|
1040
1140
|
|
|
1041
1141
|
|
|
1042
1142
|
def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose: bool = True) -> bool:
|
|
@@ -1057,13 +1157,13 @@ def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose
|
|
|
1057
1157
|
bool
|
|
1058
1158
|
True if the merged file was created or already exists, False otherwise
|
|
1059
1159
|
"""
|
|
1060
|
-
conditional_output("
|
|
1160
|
+
conditional_output("Merging genomes", "green", verbose)
|
|
1061
1161
|
genome_file_paths = glob.glob(f"{get_tmp_dir(savedir)}/faa/*.faa")
|
|
1062
1162
|
output_file = f"{get_tmp_dir(savedir)}/merged_genomes.faa"
|
|
1063
1163
|
|
|
1064
1164
|
|
|
1065
1165
|
if os.path.exists(output_file):
|
|
1066
|
-
conditional_output(f"OK: Merged genomes file already exists at: {output_file}
|
|
1166
|
+
conditional_output(f"OK: Merged genomes file already exists at: {output_file}", "white", verbose)
|
|
1067
1167
|
if logger:
|
|
1068
1168
|
logger.info(f"Using existing merged genomes file: {output_file}")
|
|
1069
1169
|
return True
|
|
@@ -1071,10 +1171,7 @@ def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose
|
|
|
1071
1171
|
|
|
1072
1172
|
if not genome_file_paths:
|
|
1073
1173
|
error_msg = f"No FAA files found in {get_tmp_dir(savedir)}/faa/"
|
|
1074
|
-
|
|
1075
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1076
|
-
if logger:
|
|
1077
|
-
logger.error(error_msg)
|
|
1174
|
+
log_error(error_msg, logger=logger)
|
|
1078
1175
|
return False
|
|
1079
1176
|
|
|
1080
1177
|
conditional_output("Merging individual genome files...", "yellow", verbose)
|
|
@@ -1084,16 +1181,13 @@ def merge_genomes(savedir: str, logger: Optional[logging.Logger] = None, verbose
|
|
|
1084
1181
|
with open(each_file) as infile:
|
|
1085
1182
|
for line in infile:
|
|
1086
1183
|
outfile.write(line)
|
|
1087
|
-
conditional_output(f"OK: Fasta files merged at: {output_file}
|
|
1184
|
+
conditional_output(f"OK: Fasta files merged at: {output_file}", "white", verbose)
|
|
1088
1185
|
if logger:
|
|
1089
1186
|
logger.info(f"Successfully created merged genome file: {output_file}")
|
|
1090
1187
|
return True
|
|
1091
1188
|
except Exception as e:
|
|
1092
1189
|
error_msg = f"Error merging genome files: {str(e)}"
|
|
1093
|
-
|
|
1094
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1095
|
-
if logger:
|
|
1096
|
-
logger.error(error_msg)
|
|
1190
|
+
log_error(error_msg, logger=logger)
|
|
1097
1191
|
return False
|
|
1098
1192
|
|
|
1099
1193
|
|
|
@@ -1117,12 +1211,16 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1117
1211
|
bool
|
|
1118
1212
|
True if emapper ran successfully or outputs already exist, False otherwise
|
|
1119
1213
|
"""
|
|
1120
|
-
conditional_output("
|
|
1214
|
+
conditional_output("Starting eggNOG-mapper", "green", verbose)
|
|
1121
1215
|
|
|
1122
1216
|
|
|
1123
1217
|
final_emapper_annotation_file = f"{savedir}/emapper_out.emapper.annotations"
|
|
1124
1218
|
if os.path.exists(final_emapper_annotation_file):
|
|
1125
|
-
|
|
1219
|
+
conditional_output(
|
|
1220
|
+
f"OK: Emapper annotations already exist at: {final_emapper_annotation_file}",
|
|
1221
|
+
"white",
|
|
1222
|
+
verbose,
|
|
1223
|
+
)
|
|
1126
1224
|
if logger:
|
|
1127
1225
|
logger.info(f"Using existing emapper annotations: {final_emapper_annotation_file}")
|
|
1128
1226
|
return True
|
|
@@ -1135,14 +1233,16 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1135
1233
|
|
|
1136
1234
|
if not os.path.exists(merged_genomes_file):
|
|
1137
1235
|
error_msg = f"Merged genomes file not found at: {merged_genomes_file}"
|
|
1138
|
-
|
|
1139
|
-
if logger:
|
|
1140
|
-
logger.error(error_msg)
|
|
1236
|
+
log_error(error_msg, logger=logger)
|
|
1141
1237
|
return False
|
|
1142
1238
|
|
|
1143
1239
|
|
|
1144
1240
|
if os.path.exists(emapper_tmp_file):
|
|
1145
|
-
|
|
1241
|
+
conditional_output(
|
|
1242
|
+
f"OK: Emapper output already exists at: {emapper_tmp_file}",
|
|
1243
|
+
"white",
|
|
1244
|
+
verbose,
|
|
1245
|
+
)
|
|
1146
1246
|
if logger:
|
|
1147
1247
|
logger.info(f"Using existing emapper output from temporary directory: {emapper_tmp_file}")
|
|
1148
1248
|
|
|
@@ -1185,41 +1285,40 @@ def run_emapper(savedir: str, ncpus: int, resource_log_file: str, lowmem: bool =
|
|
|
1185
1285
|
error_msg = f"emapper failed with return code {returncode}"
|
|
1186
1286
|
if stderr:
|
|
1187
1287
|
error_msg += f": {stderr}"
|
|
1188
|
-
|
|
1189
|
-
if logger:
|
|
1190
|
-
logger.error(error_msg)
|
|
1288
|
+
log_error(error_msg, logger=logger)
|
|
1191
1289
|
return False
|
|
1192
1290
|
|
|
1193
1291
|
|
|
1194
1292
|
if logger and stdout:
|
|
1195
|
-
|
|
1293
|
+
summary = stdout[:500] + ("..." if len(stdout) > 500 else "")
|
|
1294
|
+
_log_lines(logger, f"emapper stdout summary:\n{summary}", logging.INFO)
|
|
1196
1295
|
|
|
1197
1296
|
|
|
1198
1297
|
if not os.path.exists(emapper_tmp_file):
|
|
1199
1298
|
error_msg = f"emapper did not generate expected output: {emapper_tmp_file}"
|
|
1200
|
-
|
|
1201
|
-
if logger:
|
|
1202
|
-
logger.error(error_msg)
|
|
1299
|
+
log_error(error_msg, logger=logger)
|
|
1203
1300
|
return False
|
|
1204
1301
|
|
|
1205
1302
|
|
|
1206
1303
|
shutil.copy(emapper_tmp_file, final_emapper_annotation_file)
|
|
1207
1304
|
|
|
1208
|
-
|
|
1209
|
-
|
|
1305
|
+
conditional_output(f"OK: emapper output saved at: {output_folder_emapper}", "white", verbose)
|
|
1306
|
+
conditional_output(
|
|
1307
|
+
f"OK: emapper annotations copied to: {final_emapper_annotation_file}",
|
|
1308
|
+
"white",
|
|
1309
|
+
verbose,
|
|
1310
|
+
)
|
|
1210
1311
|
if logger:
|
|
1211
1312
|
logger.info(f"Successfully ran emapper and saved annotations to: {final_emapper_annotation_file}")
|
|
1212
1313
|
return True
|
|
1213
1314
|
|
|
1214
1315
|
except Exception as e:
|
|
1215
1316
|
error_msg = f"Error running emapper: {str(e)}"
|
|
1216
|
-
|
|
1217
|
-
if logger:
|
|
1218
|
-
logger.error(error_msg)
|
|
1317
|
+
log_error(error_msg, logger=logger)
|
|
1219
1318
|
return False
|
|
1220
1319
|
|
|
1221
1320
|
|
|
1222
|
-
def remove_temp_files(savedir: str, logger: Optional[logging.Logger] = None) -> None:
|
|
1321
|
+
def remove_temp_files(savedir: str, logger: Optional[logging.Logger] = None, verbose: bool = True) -> None:
|
|
1223
1322
|
"""
|
|
1224
1323
|
Remove temporary files and directories.
|
|
1225
1324
|
|
|
@@ -1234,13 +1333,11 @@ def remove_temp_files(savedir: str, logger: Optional[logging.Logger] = None) ->
|
|
|
1234
1333
|
if os.path.exists(tmp_dir):
|
|
1235
1334
|
try:
|
|
1236
1335
|
shutil.rmtree(tmp_dir)
|
|
1237
|
-
|
|
1336
|
+
conditional_output(f"OK: Temporary files removed from: {tmp_dir}", "white", verbose)
|
|
1238
1337
|
if logger:
|
|
1239
1338
|
logger.info(f"Removed temporary directory: {tmp_dir}")
|
|
1240
1339
|
except Exception as e:
|
|
1241
|
-
|
|
1242
|
-
if logger:
|
|
1243
|
-
logger.warning(f"Failed to remove temporary directory {tmp_dir}: {str(e)}")
|
|
1340
|
+
log_warning(f"Failed to remove temporary directory {tmp_dir}: {str(e)}", logger=logger)
|
|
1244
1341
|
|
|
1245
1342
|
|
|
1246
1343
|
def check_final_reports_exist(savedir: str, calculate_complementarity: int, logger: Optional[logging.Logger] = None) -> bool:
|
|
@@ -1343,18 +1440,16 @@ def generate_complementarity_report(
|
|
|
1343
1440
|
|
|
1344
1441
|
|
|
1345
1442
|
if os.path.exists(output_file):
|
|
1443
|
+
conditional_output(f"OK: Complementarity report already exists at: {output_file}", "white", verbose)
|
|
1346
1444
|
if logger:
|
|
1347
1445
|
logger.info(f"Complementarity report already exists at {output_file}")
|
|
1348
|
-
conditional_output(f"OK: Complementarity report already exists at: {output_file}", "white", verbose)
|
|
1349
1446
|
return
|
|
1350
1447
|
|
|
1351
1448
|
|
|
1352
1449
|
module_matrix_file = f"{savedir}/module_completeness.tsv"
|
|
1353
1450
|
if not os.path.exists(module_matrix_file):
|
|
1354
1451
|
error_msg = f"Module completeness matrix not found at: {module_matrix_file}"
|
|
1355
|
-
|
|
1356
|
-
logger.error(error_msg)
|
|
1357
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1452
|
+
log_error(error_msg, logger=logger)
|
|
1358
1453
|
return
|
|
1359
1454
|
|
|
1360
1455
|
|
|
@@ -1371,13 +1466,15 @@ def generate_complementarity_report(
|
|
|
1371
1466
|
emapper_file = possible_file
|
|
1372
1467
|
if logger:
|
|
1373
1468
|
logger.info(f"Found emapper annotation file at: {emapper_file}")
|
|
1374
|
-
|
|
1469
|
+
conditional_output(f"OK: Using emapper annotations from: {emapper_file}", "white", verbose)
|
|
1375
1470
|
break
|
|
1376
1471
|
|
|
1377
1472
|
if not emapper_file:
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1473
|
+
log_warning(
|
|
1474
|
+
"Emapper annotation file not found in any expected location. "
|
|
1475
|
+
"Will use placeholder protein IDs.",
|
|
1476
|
+
logger=logger,
|
|
1477
|
+
)
|
|
1381
1478
|
|
|
1382
1479
|
|
|
1383
1480
|
kpct_output_file = None
|
|
@@ -1395,9 +1492,7 @@ def generate_complementarity_report(
|
|
|
1395
1492
|
|
|
1396
1493
|
if not kpct_output_file:
|
|
1397
1494
|
error_msg = "KPCT output file not found. Cannot extract module metadata."
|
|
1398
|
-
|
|
1399
|
-
logger.error(error_msg)
|
|
1400
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
1495
|
+
log_error(error_msg, logger=logger)
|
|
1401
1496
|
return
|
|
1402
1497
|
|
|
1403
1498
|
try:
|
|
@@ -1476,10 +1571,13 @@ def generate_complementarity_report(
|
|
|
1476
1571
|
error_msg = f"Cannot identify required module columns in KPCT output: {kpct_output_file}"
|
|
1477
1572
|
if logger:
|
|
1478
1573
|
logger.error(error_msg)
|
|
1479
|
-
if module_id_col:
|
|
1480
|
-
|
|
1574
|
+
if module_id_col:
|
|
1575
|
+
logger.error(f"Found module_id_col: {module_id_col}")
|
|
1576
|
+
if module_name_col:
|
|
1577
|
+
logger.error(f"Found module_name_col: {module_name_col}")
|
|
1481
1578
|
logger.error(f"Available columns: {kpct_df.columns.tolist()}")
|
|
1482
|
-
|
|
1579
|
+
else:
|
|
1580
|
+
log_error(error_msg, logger=logger)
|
|
1483
1581
|
return
|
|
1484
1582
|
|
|
1485
1583
|
if not contig_col or not matching_ko_col:
|
|
@@ -1487,7 +1585,10 @@ def generate_complementarity_report(
|
|
|
1487
1585
|
logger.warning(f"Cannot identify contig or matching_ko columns in KPCT output.")
|
|
1488
1586
|
logger.warning(f"Found contig_col: {contig_col}, matching_ko_col: {matching_ko_col}")
|
|
1489
1587
|
logger.warning(f"Available columns: {kpct_df.columns.tolist()}")
|
|
1490
|
-
|
|
1588
|
+
log_warning(
|
|
1589
|
+
"Missing columns in KPCT output may affect mapping of KOs to combinations.",
|
|
1590
|
+
logger=logger,
|
|
1591
|
+
)
|
|
1491
1592
|
|
|
1492
1593
|
|
|
1493
1594
|
module_metadata = {}
|
|
@@ -1749,10 +1850,6 @@ def generate_complementarity_report(
|
|
|
1749
1850
|
|
|
1750
1851
|
report_df.to_csv(output_file, sep='\t', index=False)
|
|
1751
1852
|
|
|
1752
|
-
if logger:
|
|
1753
|
-
logger.info(f"Found {len(report_df)} complementary modules in {n_members}-member combinations")
|
|
1754
|
-
logger.info(f"Complementarity report saved to: {output_file}")
|
|
1755
|
-
|
|
1756
1853
|
conditional_output(f"OK: Found {len(report_df)} complementary modules in {n_members}-member combinations", "green", verbose)
|
|
1757
1854
|
conditional_output(f"Complementarity report saved to: {output_file}", "white", verbose)
|
|
1758
1855
|
else:
|
|
@@ -1773,17 +1870,27 @@ def generate_complementarity_report(
|
|
|
1773
1870
|
report_df = pd.DataFrame(columns=columns)
|
|
1774
1871
|
report_df.to_csv(output_file, sep='\t', index=False)
|
|
1775
1872
|
|
|
1776
|
-
|
|
1777
|
-
|
|
1873
|
+
log_warning(
|
|
1874
|
+
f"No complementary modules found in {n_members}-member combinations",
|
|
1875
|
+
logger=logger,
|
|
1876
|
+
)
|
|
1877
|
+
conditional_output(f"Empty report saved to: {output_file}", "white", verbose)
|
|
1778
1878
|
|
|
1779
1879
|
except Exception as e:
|
|
1780
1880
|
error_msg = f"Error generating complementarity report: {str(e)}"
|
|
1781
1881
|
if logger:
|
|
1782
1882
|
logger.error(error_msg, exc_info=True)
|
|
1783
|
-
|
|
1883
|
+
else:
|
|
1884
|
+
log_error(error_msg, logger=logger)
|
|
1784
1885
|
|
|
1785
1886
|
|
|
1786
|
-
def ko_matrix_to_kpct_format(
|
|
1887
|
+
def ko_matrix_to_kpct_format(
|
|
1888
|
+
kos_matrix: str,
|
|
1889
|
+
savedir: str,
|
|
1890
|
+
calculate_complementarity: int = 0,
|
|
1891
|
+
logger: Optional[logging.Logger] = None,
|
|
1892
|
+
verbose: bool = True,
|
|
1893
|
+
) -> str:
|
|
1787
1894
|
"""
|
|
1788
1895
|
Convert KO matrix to KPCT format.
|
|
1789
1896
|
|
|
@@ -1812,7 +1919,7 @@ def ko_matrix_to_kpct_format(kos_matrix: str, savedir: str, calculate_complement
|
|
|
1812
1919
|
|
|
1813
1920
|
initial_delimiter = ',' if kos_matrix.lower().endswith('.csv') else '\t'
|
|
1814
1921
|
|
|
1815
|
-
|
|
1922
|
+
conditional_output(f"Reading KO matrix file: {kos_matrix}", "yellow", verbose)
|
|
1816
1923
|
if logger:
|
|
1817
1924
|
logger.info(f"Reading KO matrix file with delimiter '{initial_delimiter}': {kos_matrix}")
|
|
1818
1925
|
|
|
@@ -1865,9 +1972,7 @@ def ko_matrix_to_kpct_format(kos_matrix: str, savedir: str, calculate_complement
|
|
|
1865
1972
|
|
|
1866
1973
|
if 'taxon_oid' not in ko_df.columns:
|
|
1867
1974
|
msg = "Invalid KO matrix format: missing 'taxon_oid' column"
|
|
1868
|
-
|
|
1869
|
-
logger.error(msg)
|
|
1870
|
-
typer.secho(f"ERROR: {msg}", fg="red")
|
|
1975
|
+
log_error(msg, logger=logger)
|
|
1871
1976
|
exit(1)
|
|
1872
1977
|
|
|
1873
1978
|
|
|
@@ -1912,14 +2017,12 @@ def ko_matrix_to_kpct_format(kos_matrix: str, savedir: str, calculate_complement
|
|
|
1912
2017
|
|
|
1913
2018
|
if logger:
|
|
1914
2019
|
logger.info(f"KO matrix converted to KPCT format: {output_path}")
|
|
1915
|
-
|
|
2020
|
+
conditional_output(f"OK: KO matrix converted to KPCT format: {output_path}", "white", verbose)
|
|
1916
2021
|
return output_path
|
|
1917
2022
|
|
|
1918
2023
|
except Exception as e:
|
|
1919
2024
|
error_msg = f"Error converting KO matrix to KPCT format: {str(e)}"
|
|
1920
|
-
|
|
1921
|
-
logger.error(error_msg)
|
|
1922
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2025
|
+
log_error(error_msg, logger=logger)
|
|
1923
2026
|
raise
|
|
1924
2027
|
|
|
1925
2028
|
|
|
@@ -1975,7 +2078,12 @@ def get_ko_protein_mappings_from_kpct_input(kpct_input_file: str, logger: Option
|
|
|
1975
2078
|
return {}
|
|
1976
2079
|
|
|
1977
2080
|
|
|
1978
|
-
def create_module_completeness_matrix(
|
|
2081
|
+
def create_module_completeness_matrix(
|
|
2082
|
+
savedir: str,
|
|
2083
|
+
kpct_outprefix: str,
|
|
2084
|
+
logger: Optional[logging.Logger] = None,
|
|
2085
|
+
verbose: bool = True,
|
|
2086
|
+
) -> None:
|
|
1979
2087
|
"""
|
|
1980
2088
|
Create a module completeness matrix from the KPCT output.
|
|
1981
2089
|
|
|
@@ -2007,9 +2115,7 @@ def create_module_completeness_matrix(savedir: str, kpct_outprefix: str, logger:
|
|
|
2007
2115
|
|
|
2008
2116
|
if not os.path.exists(kpct_output_file):
|
|
2009
2117
|
error_msg = f"KPCT output file not found: tried {kpct_outprefix}_contigs.with_weights.tsv and alternatives"
|
|
2010
|
-
|
|
2011
|
-
logger.error(error_msg)
|
|
2012
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2118
|
+
log_error(error_msg, logger=logger)
|
|
2013
2119
|
return
|
|
2014
2120
|
|
|
2015
2121
|
try:
|
|
@@ -2109,7 +2215,8 @@ def create_module_completeness_matrix(savedir: str, kpct_outprefix: str, logger:
|
|
|
2109
2215
|
error_msg = "Could not identify module columns in the KPCT output"
|
|
2110
2216
|
if logger:
|
|
2111
2217
|
logger.error(error_msg)
|
|
2112
|
-
|
|
2218
|
+
else:
|
|
2219
|
+
log_error(error_msg, logger=logger)
|
|
2113
2220
|
return
|
|
2114
2221
|
|
|
2115
2222
|
# Build the result data
|
|
@@ -2152,19 +2259,22 @@ def create_module_completeness_matrix(savedir: str, kpct_outprefix: str, logger:
|
|
|
2152
2259
|
logger.info(f"Matrix contains {single_genomes} single genomes out of {total_genomes} total entries")
|
|
2153
2260
|
if all_genomes:
|
|
2154
2261
|
logger.info(f"Expected {len(all_genomes)} single genomes from KPCT input")
|
|
2155
|
-
|
|
2262
|
+
conditional_output(f"OK: Module completeness matrix saved to: {output_file}", "white", verbose)
|
|
2156
2263
|
|
|
2157
2264
|
except Exception as e:
|
|
2158
2265
|
error_msg = f"Error creating module completeness matrix: {str(e)}"
|
|
2159
2266
|
if logger:
|
|
2160
|
-
logger.error(error_msg)
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
if logger:
|
|
2164
|
-
logger.error(f"Error details: {e}", exc_info=True)
|
|
2267
|
+
logger.error(error_msg, exc_info=True)
|
|
2268
|
+
else:
|
|
2269
|
+
log_error(error_msg, logger=logger)
|
|
2165
2270
|
|
|
2166
2271
|
|
|
2167
|
-
def create_ko_matrix_from_emapper_annotation(
|
|
2272
|
+
def create_ko_matrix_from_emapper_annotation(
|
|
2273
|
+
emapper_file_path: str,
|
|
2274
|
+
output_file_path: str,
|
|
2275
|
+
logger: Optional[logging.Logger] = None,
|
|
2276
|
+
verbose: bool = True,
|
|
2277
|
+
) -> None:
|
|
2168
2278
|
"""
|
|
2169
2279
|
Create a KO matrix from an eggNOG-mapper annotation file.
|
|
2170
2280
|
|
|
@@ -2203,24 +2313,22 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2203
2313
|
- Removes 'ko:' prefixes and weight annotations like '(0.5)'
|
|
2204
2314
|
- Skips rows with missing or '-' KO annotations
|
|
2205
2315
|
"""
|
|
2206
|
-
|
|
2316
|
+
conditional_output("Creating KO matrix from eggNOG-mapper annotations", "green", verbose)
|
|
2207
2317
|
|
|
2208
2318
|
|
|
2209
2319
|
if os.path.exists(output_file_path):
|
|
2210
|
-
|
|
2320
|
+
conditional_output(f"OK: KO matrix already exists at: {output_file_path}", "white", verbose)
|
|
2211
2321
|
if logger:
|
|
2212
2322
|
logger.info(f"KO matrix already exists at: {output_file_path}")
|
|
2213
2323
|
return
|
|
2214
2324
|
|
|
2215
2325
|
if not os.path.exists(emapper_file_path):
|
|
2216
2326
|
error_msg = f"eMapper annotation file not found at {emapper_file_path}. Cannot proceed."
|
|
2217
|
-
|
|
2218
|
-
logger.error(error_msg)
|
|
2219
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2327
|
+
log_error(error_msg, logger=logger)
|
|
2220
2328
|
exit(1)
|
|
2221
2329
|
|
|
2222
2330
|
try:
|
|
2223
|
-
|
|
2331
|
+
conditional_output("Processing eggNOG-mapper annotations and extracting KO terms...", "yellow", verbose)
|
|
2224
2332
|
|
|
2225
2333
|
if logger:
|
|
2226
2334
|
logger.info(f"Reading eggNOG-mapper annotations from: {emapper_file_path}")
|
|
@@ -2280,13 +2388,11 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2280
2388
|
|
|
2281
2389
|
if not kos_data_for_matrix_df:
|
|
2282
2390
|
error_msg = "No KO data found in the eMapper annotations file"
|
|
2283
|
-
|
|
2284
|
-
logger.error(error_msg)
|
|
2285
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2391
|
+
log_error(error_msg, logger=logger)
|
|
2286
2392
|
return
|
|
2287
2393
|
|
|
2288
2394
|
|
|
2289
|
-
|
|
2395
|
+
conditional_output("Creating KO count matrix (kos_matrix.csv)...", "yellow", verbose)
|
|
2290
2396
|
kos_count_df = pd.concat(kos_data_for_matrix_df)
|
|
2291
2397
|
|
|
2292
2398
|
|
|
@@ -2313,13 +2419,14 @@ def create_ko_matrix_from_emapper_annotation(emapper_file_path: str, output_file
|
|
|
2313
2419
|
logger.info(f"Created KO matrix with {len(kos_count_df)} genomes and {len(kos_count_df.columns)-1} KOs")
|
|
2314
2420
|
logger.info(f"KO matrix saved to: {output_file_path}")
|
|
2315
2421
|
|
|
2316
|
-
|
|
2422
|
+
conditional_output(f"OK: KO matrix created and saved to: {output_file_path}", "white", verbose)
|
|
2317
2423
|
|
|
2318
2424
|
except Exception as e:
|
|
2319
2425
|
error_msg = f"Error creating KO matrix: {str(e)}"
|
|
2320
2426
|
if logger:
|
|
2321
2427
|
logger.error(error_msg, exc_info=True)
|
|
2322
|
-
|
|
2428
|
+
else:
|
|
2429
|
+
log_error(error_msg, logger=logger)
|
|
2323
2430
|
exit(1)
|
|
2324
2431
|
|
|
2325
2432
|
|
|
@@ -2347,16 +2454,12 @@ def check_kpct_installed(logger: Optional[logging.Logger] = None) -> bool:
|
|
|
2347
2454
|
|
|
2348
2455
|
if give_completeness_check.returncode != 0:
|
|
2349
2456
|
error_msg = "KPCT 'give_completeness' tool not found in PATH. Please install it via pip: pip install kegg-pathways-completeness"
|
|
2350
|
-
|
|
2351
|
-
logger.error(error_msg)
|
|
2352
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2457
|
+
log_error(error_msg, logger=logger)
|
|
2353
2458
|
return False
|
|
2354
2459
|
return True
|
|
2355
2460
|
except Exception as e:
|
|
2356
2461
|
error_msg = f"Error checking for KPCT installation: {str(e)}"
|
|
2357
|
-
|
|
2358
|
-
logger.error(error_msg)
|
|
2359
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2462
|
+
log_error(error_msg, logger=logger)
|
|
2360
2463
|
return False
|
|
2361
2464
|
|
|
2362
2465
|
|
|
@@ -2441,9 +2544,7 @@ def chunk_kpct_input_file(kpct_input_file: str, savedir: str, n_chunks: int, log
|
|
|
2441
2544
|
|
|
2442
2545
|
if not lines:
|
|
2443
2546
|
error_msg = f"KPCT input file is empty: {kpct_input_file}"
|
|
2444
|
-
|
|
2445
|
-
logger.error(error_msg)
|
|
2446
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2547
|
+
log_error(error_msg, logger=logger)
|
|
2447
2548
|
return []
|
|
2448
2549
|
|
|
2449
2550
|
# Calculate lines per chunk using ceiling division to ensure we create exactly n_chunks
|
|
@@ -2495,9 +2596,7 @@ def chunk_kpct_input_file(kpct_input_file: str, savedir: str, n_chunks: int, log
|
|
|
2495
2596
|
|
|
2496
2597
|
except Exception as e:
|
|
2497
2598
|
error_msg = f"Error chunking KPCT input file: {str(e)}"
|
|
2498
|
-
|
|
2499
|
-
logger.error(error_msg)
|
|
2500
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2599
|
+
log_error(error_msg, logger=logger)
|
|
2501
2600
|
return []
|
|
2502
2601
|
|
|
2503
2602
|
|
|
@@ -2636,13 +2735,18 @@ def concatenate_kpct_outputs(chunk_dirs: List[str], savedir: str, kpct_outprefix
|
|
|
2636
2735
|
|
|
2637
2736
|
except Exception as e:
|
|
2638
2737
|
error_msg = f"Error concatenating KPCT outputs: {str(e)}"
|
|
2639
|
-
|
|
2640
|
-
logger.error(error_msg)
|
|
2641
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2738
|
+
log_error(error_msg, logger=logger)
|
|
2642
2739
|
return False
|
|
2643
2740
|
|
|
2644
2741
|
|
|
2645
|
-
def run_kpct_parallel(
|
|
2742
|
+
def run_kpct_parallel(
|
|
2743
|
+
kpct_input_file: str,
|
|
2744
|
+
savedir: str,
|
|
2745
|
+
kpct_outprefix: str,
|
|
2746
|
+
ncpus: int,
|
|
2747
|
+
logger: Optional[logging.Logger] = None,
|
|
2748
|
+
verbose: bool = True,
|
|
2749
|
+
) -> bool:
|
|
2646
2750
|
"""
|
|
2647
2751
|
Run KPCT in parallel by chunking the input file and processing chunks concurrently.
|
|
2648
2752
|
|
|
@@ -2693,7 +2797,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2693
2797
|
if all(os.path.exists(f) for f in final_outputs):
|
|
2694
2798
|
if logger:
|
|
2695
2799
|
logger.info("KPCT output files already exist, skipping parallel processing")
|
|
2696
|
-
|
|
2800
|
+
conditional_output("OK: KPCT output files already exist", "white", verbose)
|
|
2697
2801
|
return True
|
|
2698
2802
|
|
|
2699
2803
|
|
|
@@ -2702,7 +2806,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2702
2806
|
if logger:
|
|
2703
2807
|
logger.info(f"Running KPCT in parallel with up to {n_chunks} chunks using {ncpus} CPU cores")
|
|
2704
2808
|
|
|
2705
|
-
|
|
2809
|
+
conditional_output(f"Running KPCT in parallel with up to {n_chunks} chunks", "yellow", verbose)
|
|
2706
2810
|
|
|
2707
2811
|
|
|
2708
2812
|
chunks_base_dir = os.path.join(get_tmp_dir(savedir), "kpct_chunk_outputs")
|
|
@@ -2721,7 +2825,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2721
2825
|
if all_chunks_exist:
|
|
2722
2826
|
if logger:
|
|
2723
2827
|
logger.info("All chunk outputs already exist, proceeding to concatenation")
|
|
2724
|
-
|
|
2828
|
+
conditional_output("OK: All chunks already processed, concatenating results", "white", verbose)
|
|
2725
2829
|
|
|
2726
2830
|
|
|
2727
2831
|
concatenation_success = concatenate_kpct_outputs(existing_chunk_dirs, savedir, kpct_outprefix, logger)
|
|
@@ -2778,11 +2882,11 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2778
2882
|
if not chunks_to_process:
|
|
2779
2883
|
if logger:
|
|
2780
2884
|
logger.info("All chunks already processed, proceeding to concatenation")
|
|
2781
|
-
|
|
2885
|
+
conditional_output("OK: All chunks already processed, concatenating results", "white", verbose)
|
|
2782
2886
|
else:
|
|
2783
2887
|
if logger:
|
|
2784
2888
|
logger.info(f"Processing {len(chunks_to_process)} remaining chunks")
|
|
2785
|
-
|
|
2889
|
+
conditional_output(f"Processing {len(chunks_to_process)} remaining chunks", "yellow", verbose)
|
|
2786
2890
|
|
|
2787
2891
|
|
|
2788
2892
|
failed_chunks = []
|
|
@@ -2824,9 +2928,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2824
2928
|
|
|
2825
2929
|
if failed_chunks:
|
|
2826
2930
|
error_msg = f"Failed to process {len(failed_chunks)} chunks: {failed_chunks}"
|
|
2827
|
-
|
|
2828
|
-
logger.error(error_msg)
|
|
2829
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2931
|
+
log_error(error_msg, logger=logger)
|
|
2830
2932
|
return False
|
|
2831
2933
|
|
|
2832
2934
|
|
|
@@ -2837,9 +2939,7 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2837
2939
|
|
|
2838
2940
|
if not all_chunks_exist:
|
|
2839
2941
|
error_msg = f"Not all chunks were processed successfully. Expected {len(all_chunk_dirs)}, got {len(final_chunk_dirs)}"
|
|
2840
|
-
|
|
2841
|
-
logger.error(error_msg)
|
|
2842
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2942
|
+
log_error(error_msg, logger=logger)
|
|
2843
2943
|
return False
|
|
2844
2944
|
|
|
2845
2945
|
|
|
@@ -2855,26 +2955,29 @@ def run_kpct_parallel(kpct_input_file: str, savedir: str, kpct_outprefix: str, n
|
|
|
2855
2955
|
missing_outputs = [f for f in final_outputs if not os.path.exists(f)]
|
|
2856
2956
|
if missing_outputs:
|
|
2857
2957
|
error_msg = f"Failed to create final output files: {missing_outputs}"
|
|
2858
|
-
|
|
2859
|
-
logger.error(error_msg)
|
|
2860
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2958
|
+
log_error(error_msg, logger=logger)
|
|
2861
2959
|
return False
|
|
2862
2960
|
|
|
2863
2961
|
if logger:
|
|
2864
2962
|
logger.info("Successfully completed parallel KPCT processing")
|
|
2865
|
-
|
|
2963
|
+
conditional_output("OK: KPCT parallel processing completed successfully", "green", verbose)
|
|
2866
2964
|
|
|
2867
2965
|
return True
|
|
2868
2966
|
|
|
2869
2967
|
except Exception as e:
|
|
2870
2968
|
error_msg = f"Error in parallel KPCT processing: {str(e)}"
|
|
2871
|
-
|
|
2872
|
-
logger.error(error_msg)
|
|
2873
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
2969
|
+
log_error(error_msg, logger=logger)
|
|
2874
2970
|
return False
|
|
2875
2971
|
|
|
2876
2972
|
|
|
2877
|
-
def run_kpct(
|
|
2973
|
+
def run_kpct(
|
|
2974
|
+
kpct_input_file: str,
|
|
2975
|
+
savedir: str,
|
|
2976
|
+
kpct_outprefix: str,
|
|
2977
|
+
resource_log_file: str,
|
|
2978
|
+
logger: Optional[logging.Logger] = None,
|
|
2979
|
+
verbose: bool = True,
|
|
2980
|
+
) -> bool:
|
|
2878
2981
|
"""
|
|
2879
2982
|
Run the KPCT give_completeness tool (sequential version).
|
|
2880
2983
|
This function is kept as a fallback in case parallel processing fails.
|
|
@@ -2912,21 +3015,19 @@ def run_kpct(kpct_input_file: str, savedir: str, kpct_outprefix: str, resource_l
|
|
|
2912
3015
|
resource_log_file,
|
|
2913
3016
|
logger,
|
|
2914
3017
|
"Running KPCT give_completeness tool (sequential)",
|
|
2915
|
-
|
|
3018
|
+
verbose,
|
|
2916
3019
|
)
|
|
2917
3020
|
|
|
2918
3021
|
if returncode != 0:
|
|
2919
3022
|
error_msg = f"KPCT tool failed with return code {returncode}"
|
|
2920
3023
|
if stderr:
|
|
2921
3024
|
error_msg += f": {stderr}"
|
|
2922
|
-
|
|
2923
|
-
logger.error(error_msg)
|
|
2924
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
3025
|
+
log_error(error_msg, logger=logger)
|
|
2925
3026
|
return False
|
|
2926
3027
|
|
|
2927
3028
|
|
|
2928
3029
|
if logger and stdout:
|
|
2929
|
-
logger
|
|
3030
|
+
_log_lines(logger, f"KPCT stdout:\n{stdout}", logging.INFO)
|
|
2930
3031
|
|
|
2931
3032
|
|
|
2932
3033
|
possible_kpct_files = [
|
|
@@ -2939,28 +3040,36 @@ def run_kpct(kpct_input_file: str, savedir: str, kpct_outprefix: str, resource_l
|
|
|
2939
3040
|
|
|
2940
3041
|
if not kpct_file_exists:
|
|
2941
3042
|
error_msg = f"KPCT did not generate any output files with prefix '{kpct_outprefix}'"
|
|
2942
|
-
|
|
2943
|
-
logger.error(error_msg)
|
|
2944
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
3043
|
+
log_error(error_msg, logger=logger)
|
|
2945
3044
|
return False
|
|
2946
3045
|
|
|
2947
3046
|
|
|
2948
3047
|
created_files = [f for f in possible_kpct_files if os.path.exists(f)]
|
|
2949
3048
|
if logger:
|
|
2950
3049
|
logger.info(f"KPCT successfully created output files: {created_files}")
|
|
2951
|
-
|
|
3050
|
+
conditional_output(
|
|
3051
|
+
f"OK: KPCT completed successfully. Created files: {[os.path.basename(f) for f in created_files]}",
|
|
3052
|
+
"green",
|
|
3053
|
+
verbose,
|
|
3054
|
+
)
|
|
2952
3055
|
|
|
2953
3056
|
return True
|
|
2954
3057
|
|
|
2955
3058
|
except Exception as e:
|
|
2956
3059
|
error_msg = f"Error running KPCT: {str(e)}"
|
|
2957
|
-
|
|
2958
|
-
logger.error(error_msg)
|
|
2959
|
-
typer.secho(f"ERROR: {error_msg}", fg="red")
|
|
3060
|
+
log_error(error_msg, logger=logger)
|
|
2960
3061
|
return False
|
|
2961
3062
|
|
|
2962
3063
|
|
|
2963
|
-
def run_kpct_with_fallback(
|
|
3064
|
+
def run_kpct_with_fallback(
|
|
3065
|
+
kpct_input_file: str,
|
|
3066
|
+
savedir: str,
|
|
3067
|
+
kpct_outprefix: str,
|
|
3068
|
+
ncpus: int,
|
|
3069
|
+
resource_log_file: str,
|
|
3070
|
+
logger: Optional[logging.Logger] = None,
|
|
3071
|
+
verbose: bool = True,
|
|
3072
|
+
) -> bool:
|
|
2964
3073
|
"""
|
|
2965
3074
|
Run KPCT with parallel processing and fallback to sequential if parallel fails.
|
|
2966
3075
|
|
|
@@ -2987,20 +3096,27 @@ def run_kpct_with_fallback(kpct_input_file: str, savedir: str, kpct_outprefix: s
|
|
|
2987
3096
|
if logger:
|
|
2988
3097
|
logger.info("Attempting parallel KPCT processing")
|
|
2989
3098
|
|
|
2990
|
-
parallel_success = run_kpct_parallel(
|
|
3099
|
+
parallel_success = run_kpct_parallel(
|
|
3100
|
+
kpct_input_file,
|
|
3101
|
+
savedir,
|
|
3102
|
+
kpct_outprefix,
|
|
3103
|
+
ncpus,
|
|
3104
|
+
logger,
|
|
3105
|
+
verbose=verbose,
|
|
3106
|
+
)
|
|
2991
3107
|
|
|
2992
3108
|
if parallel_success:
|
|
2993
3109
|
return True
|
|
2994
3110
|
else:
|
|
2995
3111
|
if logger:
|
|
2996
3112
|
logger.warning("Parallel KPCT processing failed, falling back to sequential processing")
|
|
2997
|
-
|
|
3113
|
+
log_warning("Parallel processing failed, trying sequential approach", logger=logger)
|
|
2998
3114
|
|
|
2999
3115
|
|
|
3000
3116
|
if logger:
|
|
3001
3117
|
logger.info("Running KPCT in sequential mode")
|
|
3002
3118
|
|
|
3003
|
-
return run_kpct(kpct_input_file, savedir, kpct_outprefix, resource_log_file, logger)
|
|
3119
|
+
return run_kpct(kpct_input_file, savedir, kpct_outprefix, resource_log_file, logger, verbose=verbose)
|
|
3004
3120
|
|
|
3005
3121
|
|
|
3006
3122
|
app = typer.Typer()
|
|
@@ -3012,7 +3128,12 @@ def pipeline(genomedir: str,
|
|
|
3012
3128
|
adapt_headers: bool=False,
|
|
3013
3129
|
del_tmp: bool=True,
|
|
3014
3130
|
calculate_complementarity: int=0,
|
|
3015
|
-
lowmem: bool = typer.Option(
|
|
3131
|
+
lowmem: bool = typer.Option(
|
|
3132
|
+
False,
|
|
3133
|
+
"--lowmem/--fullmem",
|
|
3134
|
+
"--low-mem/--full-mem",
|
|
3135
|
+
help="Run emapper with reduced memory footprint, omitting --dbmem flag.",
|
|
3136
|
+
),
|
|
3016
3137
|
verbose: bool = typer.Option(False, "--verbose", help="Enable verbose output with detailed progress information."),
|
|
3017
3138
|
log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level (DEBUG, INFO, WARNING, ERROR)."),
|
|
3018
3139
|
eggnog_data_dir: Optional[str] = typer.Option(None, "--eggnog-data-dir", help="Path to eggNOG-mapper data directory (sets EGGNOG_DATA_DIR)."),
|
|
@@ -3066,6 +3187,7 @@ def pipeline(genomedir: str,
|
|
|
3066
3187
|
# Setup logging first to capture everything
|
|
3067
3188
|
log_dir = Path(savedir) / "logs"
|
|
3068
3189
|
logger = configure_logging(log_level, log_dir)
|
|
3190
|
+
RESOURCE_SUMMARIES.clear()
|
|
3069
3191
|
logger.info("Starting moducomp pipeline.")
|
|
3070
3192
|
logger.info("Genome directory: %s", genomedir)
|
|
3071
3193
|
logger.info("Output directory: %s", savedir)
|
|
@@ -3092,7 +3214,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3092
3214
|
start_time = time.time()
|
|
3093
3215
|
|
|
3094
3216
|
greetings(verbose)
|
|
3095
|
-
conditional_output("
|
|
3217
|
+
conditional_output("Initializing pipeline...", "green", verbose)
|
|
3096
3218
|
|
|
3097
3219
|
# Convert to absolute paths
|
|
3098
3220
|
genomedir = os.path.abspath(genomedir)
|
|
@@ -3175,21 +3297,19 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3175
3297
|
logger.info("Starting genome merging")
|
|
3176
3298
|
merge_success = merge_genomes(savedir, logger, verbose)
|
|
3177
3299
|
if not merge_success:
|
|
3178
|
-
|
|
3179
|
-
typer.secho("ERROR: Failed to merge genomes. Exiting pipeline.", fg="red")
|
|
3300
|
+
log_error("Failed to merge genomes. Exiting pipeline.", logger=logger)
|
|
3180
3301
|
return
|
|
3181
3302
|
|
|
3182
3303
|
# Run eggNOG-mapper
|
|
3183
3304
|
logger.info(f"Starting eMapper with {ncpus} CPUs")
|
|
3184
3305
|
emapper_success = run_emapper(savedir, ncpus, resource_log_file, lowmem, logger, verbose)
|
|
3185
3306
|
if not emapper_success:
|
|
3186
|
-
|
|
3187
|
-
typer.secho("ERROR: Failed to run emapper. Exiting pipeline.", fg="red")
|
|
3307
|
+
log_error("Failed to run emapper. Exiting pipeline.", logger=logger)
|
|
3188
3308
|
return
|
|
3189
3309
|
|
|
3190
3310
|
# Create KO matrix from annotations
|
|
3191
3311
|
logger.info(f"Creating KO matrix from eMapper annotations: {emapper_annotation_file}")
|
|
3192
|
-
create_ko_matrix_from_emapper_annotation(emapper_annotation_file, ko_matrix_path, logger)
|
|
3312
|
+
create_ko_matrix_from_emapper_annotation(emapper_annotation_file, ko_matrix_path, logger, verbose)
|
|
3193
3313
|
logger.info(f"KO matrix created: {ko_matrix_path}")
|
|
3194
3314
|
|
|
3195
3315
|
# Process module completeness
|
|
@@ -3197,7 +3317,11 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3197
3317
|
|
|
3198
3318
|
if os.path.exists(module_completeness_file):
|
|
3199
3319
|
logger.info(f"Module completeness matrix already exists: {module_completeness_file}")
|
|
3200
|
-
|
|
3320
|
+
conditional_output(
|
|
3321
|
+
f"OK: Using existing module completeness matrix: {module_completeness_file}",
|
|
3322
|
+
"white",
|
|
3323
|
+
verbose,
|
|
3324
|
+
)
|
|
3201
3325
|
else:
|
|
3202
3326
|
# Set up KPCT processing
|
|
3203
3327
|
kpct_outprefix = "output_give_completeness"
|
|
@@ -3215,10 +3339,10 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3215
3339
|
# Convert KO matrix to KPCT format if needed
|
|
3216
3340
|
if not os.path.exists(kpct_input_file):
|
|
3217
3341
|
logger.info(f"Converting KO matrix to KPCT format: {ko_matrix_path}")
|
|
3218
|
-
ko_matrix_to_kpct_format(ko_matrix_path, savedir, calculate_complementarity, logger)
|
|
3342
|
+
ko_matrix_to_kpct_format(ko_matrix_path, savedir, calculate_complementarity, logger, verbose)
|
|
3219
3343
|
else:
|
|
3220
3344
|
logger.info(f"KPCT input file already exists: {kpct_input_file}")
|
|
3221
|
-
|
|
3345
|
+
conditional_output(f"OK: Using existing KPCT input file: {kpct_input_file}", "white", verbose)
|
|
3222
3346
|
|
|
3223
3347
|
# Run KPCT if needed
|
|
3224
3348
|
if not kpct_file_exists:
|
|
@@ -3228,16 +3352,28 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3228
3352
|
|
|
3229
3353
|
# Run KPCT with parallel processing
|
|
3230
3354
|
logger.info(f"Running KPCT with parallel processing on file: {kpct_input_file}")
|
|
3231
|
-
kpct_success = run_kpct_with_fallback(
|
|
3355
|
+
kpct_success = run_kpct_with_fallback(
|
|
3356
|
+
kpct_input_file,
|
|
3357
|
+
savedir,
|
|
3358
|
+
kpct_outprefix,
|
|
3359
|
+
ncpus,
|
|
3360
|
+
resource_log_file,
|
|
3361
|
+
logger,
|
|
3362
|
+
verbose=verbose,
|
|
3363
|
+
)
|
|
3232
3364
|
if not kpct_success:
|
|
3233
3365
|
return
|
|
3234
3366
|
else:
|
|
3235
3367
|
logger.info(f"KPCT output file(s) already exist with prefix '{kpct_outprefix}'")
|
|
3236
|
-
|
|
3368
|
+
conditional_output(
|
|
3369
|
+
f"OK: Using existing KPCT output files with prefix '{kpct_outprefix}'",
|
|
3370
|
+
"white",
|
|
3371
|
+
verbose,
|
|
3372
|
+
)
|
|
3237
3373
|
|
|
3238
3374
|
# Create module completeness matrix
|
|
3239
3375
|
logger.info(f"Creating module completeness matrix")
|
|
3240
|
-
create_module_completeness_matrix(savedir, kpct_outprefix, logger)
|
|
3376
|
+
create_module_completeness_matrix(savedir, kpct_outprefix, logger, verbose)
|
|
3241
3377
|
|
|
3242
3378
|
# Generate complementarity reports if requested
|
|
3243
3379
|
if calculate_complementarity >= 2:
|
|
@@ -3248,7 +3384,11 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3248
3384
|
complementarity_report_file = f"{savedir}/module_completeness_complementarity_{n_members}member.tsv"
|
|
3249
3385
|
if os.path.exists(complementarity_report_file):
|
|
3250
3386
|
logger.info(f"Complementarity report for {n_members}-member combinations already exists: {complementarity_report_file}")
|
|
3251
|
-
|
|
3387
|
+
conditional_output(
|
|
3388
|
+
f"OK: Using existing {n_members}-member complementarity report: {complementarity_report_file}",
|
|
3389
|
+
"white",
|
|
3390
|
+
verbose,
|
|
3391
|
+
)
|
|
3252
3392
|
else:
|
|
3253
3393
|
logger.info(f"Generating complementarity report for {n_members}-member combinations")
|
|
3254
3394
|
generate_complementarity_report(savedir, n_members, logger, verbose)
|
|
@@ -3256,7 +3396,7 @@ def _run_pipeline_core(genomedir: str, savedir: str, ncpus: int, adapt_headers:
|
|
|
3256
3396
|
# Clean up temporary files if requested
|
|
3257
3397
|
if del_tmp:
|
|
3258
3398
|
logger.info("Cleaning up temporary files")
|
|
3259
|
-
remove_temp_files(savedir, logger)
|
|
3399
|
+
remove_temp_files(savedir, logger, verbose)
|
|
3260
3400
|
|
|
3261
3401
|
# Generate final resource usage summary
|
|
3262
3402
|
log_final_resource_summary(resource_log_file, start_time, logger, verbose)
|
|
@@ -3299,7 +3439,8 @@ def test(
|
|
|
3299
3439
|
lowmem: bool = typer.Option(
|
|
3300
3440
|
True,
|
|
3301
3441
|
"--lowmem/--fullmem",
|
|
3302
|
-
|
|
3442
|
+
"--low-mem/--full-mem",
|
|
3443
|
+
help="Run emapper with reduced memory footprint during the test (default: low-mem).",
|
|
3303
3444
|
),
|
|
3304
3445
|
verbose: bool = typer.Option(
|
|
3305
3446
|
True,
|
|
@@ -3331,6 +3472,7 @@ def test(
|
|
|
3331
3472
|
|
|
3332
3473
|
log_dir = Path(savedir) / "logs"
|
|
3333
3474
|
logger = configure_logging(log_level, log_dir)
|
|
3475
|
+
RESOURCE_SUMMARIES.clear()
|
|
3334
3476
|
logger.info("Starting moducomp test run.")
|
|
3335
3477
|
logger.info("Test genomes: %s", test_root)
|
|
3336
3478
|
logger.info("CLI command: %s", " ".join(shlex.quote(arg) for arg in sys.argv))
|
|
@@ -3446,6 +3588,9 @@ def download_eggnog_data(
|
|
|
3446
3588
|
stdout_thread.start()
|
|
3447
3589
|
stderr_thread.start()
|
|
3448
3590
|
|
|
3591
|
+
output_level = logging.INFO if verbose else logging.DEBUG
|
|
3592
|
+
error_level = logging.WARNING if verbose else logging.DEBUG
|
|
3593
|
+
|
|
3449
3594
|
while process.poll() is None or not stdout_queue.empty() or not stderr_queue.empty():
|
|
3450
3595
|
now = time.time()
|
|
3451
3596
|
|
|
@@ -3454,9 +3599,8 @@ def download_eggnog_data(
|
|
|
3454
3599
|
while True:
|
|
3455
3600
|
stream_type, line = stdout_queue.get_nowait()
|
|
3456
3601
|
if line:
|
|
3457
|
-
if
|
|
3458
|
-
|
|
3459
|
-
logger.info(line)
|
|
3602
|
+
if logger:
|
|
3603
|
+
_log_lines(logger, line, output_level)
|
|
3460
3604
|
except queue.Empty:
|
|
3461
3605
|
pass
|
|
3462
3606
|
|
|
@@ -3465,9 +3609,8 @@ def download_eggnog_data(
|
|
|
3465
3609
|
while True:
|
|
3466
3610
|
stream_type, line = stderr_queue.get_nowait()
|
|
3467
3611
|
if line:
|
|
3468
|
-
if
|
|
3469
|
-
|
|
3470
|
-
logger.warning(line)
|
|
3612
|
+
if logger:
|
|
3613
|
+
_log_lines(logger, line, error_level)
|
|
3471
3614
|
except queue.Empty:
|
|
3472
3615
|
pass
|
|
3473
3616
|
|
|
@@ -3506,8 +3649,6 @@ def download_eggnog_data(
|
|
|
3506
3649
|
f"across {total_files} files"
|
|
3507
3650
|
)
|
|
3508
3651
|
logger.info(summary)
|
|
3509
|
-
if verbose:
|
|
3510
|
-
typer.secho(summary, fg="green")
|
|
3511
3652
|
|
|
3512
3653
|
if returncode != 0:
|
|
3513
3654
|
raise typer.Exit(returncode)
|
|
@@ -3578,6 +3719,7 @@ def analyze_ko_matrix(
|
|
|
3578
3719
|
|
|
3579
3720
|
log_dir = Path(savedir) / "logs"
|
|
3580
3721
|
logger = configure_logging(log_level, log_dir)
|
|
3722
|
+
RESOURCE_SUMMARIES.clear()
|
|
3581
3723
|
|
|
3582
3724
|
# Setup resource monitoring
|
|
3583
3725
|
resource_log_file = setup_resource_logging(log_dir)
|
|
@@ -3589,11 +3731,11 @@ def analyze_ko_matrix(
|
|
|
3589
3731
|
logger.info("CLI command: %s", " ".join(shlex.quote(arg) for arg in sys.argv))
|
|
3590
3732
|
|
|
3591
3733
|
greetings(verbose)
|
|
3592
|
-
conditional_output("
|
|
3734
|
+
conditional_output("Initializing KO matrix analysis...", "green", verbose)
|
|
3593
3735
|
|
|
3594
3736
|
|
|
3595
3737
|
if not os.path.exists(kos_matrix):
|
|
3596
|
-
|
|
3738
|
+
log_error(f"KO matrix file not found at: {kos_matrix}", logger=logger)
|
|
3597
3739
|
exit(1)
|
|
3598
3740
|
|
|
3599
3741
|
|
|
@@ -3607,7 +3749,7 @@ def analyze_ko_matrix(
|
|
|
3607
3749
|
|
|
3608
3750
|
|
|
3609
3751
|
if check_final_reports_exist(savedir, calculate_complementarity, logger):
|
|
3610
|
-
|
|
3752
|
+
conditional_output("OK: All output files already exist. Skipping processing.", "green", verbose)
|
|
3611
3753
|
logger.info("Analysis skipped as all output files already exist")
|
|
3612
3754
|
return
|
|
3613
3755
|
|
|
@@ -3642,10 +3784,10 @@ def analyze_ko_matrix(
|
|
|
3642
3784
|
|
|
3643
3785
|
if not os.path.exists(kpct_input_file):
|
|
3644
3786
|
logger.info(f"Converting KO matrix to KPCT format: {kos_matrix}")
|
|
3645
|
-
ko_matrix_to_kpct_format(kos_matrix, savedir, calculate_complementarity, logger)
|
|
3787
|
+
ko_matrix_to_kpct_format(kos_matrix, savedir, calculate_complementarity, logger, verbose)
|
|
3646
3788
|
else:
|
|
3647
3789
|
logger.info(f"KPCT input file already exists: {kpct_input_file}")
|
|
3648
|
-
|
|
3790
|
+
conditional_output(f"OK: Using existing KPCT input file: {kpct_input_file}", "white", verbose)
|
|
3649
3791
|
|
|
3650
3792
|
|
|
3651
3793
|
if not kpct_file_exists:
|
|
@@ -3655,22 +3797,38 @@ def analyze_ko_matrix(
|
|
|
3655
3797
|
|
|
3656
3798
|
|
|
3657
3799
|
logger.info(f"Running KPCT with parallel processing on file: {kpct_input_file}")
|
|
3658
|
-
kpct_success = run_kpct_with_fallback(
|
|
3800
|
+
kpct_success = run_kpct_with_fallback(
|
|
3801
|
+
kpct_input_file,
|
|
3802
|
+
savedir,
|
|
3803
|
+
kpct_outprefix,
|
|
3804
|
+
ncpus,
|
|
3805
|
+
resource_log_file,
|
|
3806
|
+
logger,
|
|
3807
|
+
verbose=verbose,
|
|
3808
|
+
)
|
|
3659
3809
|
if not kpct_success:
|
|
3660
3810
|
exit(1)
|
|
3661
3811
|
else:
|
|
3662
3812
|
logger.info(f"KPCT output file(s) already exist with prefix '{kpct_outprefix}'")
|
|
3663
|
-
|
|
3813
|
+
conditional_output(
|
|
3814
|
+
f"OK: Using existing KPCT output files with prefix '{kpct_outprefix}'",
|
|
3815
|
+
"white",
|
|
3816
|
+
verbose,
|
|
3817
|
+
)
|
|
3664
3818
|
|
|
3665
3819
|
|
|
3666
3820
|
if not os.path.exists(module_completeness_file):
|
|
3667
3821
|
if logger:
|
|
3668
3822
|
logger.info(f"Creating module completeness matrix")
|
|
3669
|
-
create_module_completeness_matrix(savedir, kpct_outprefix, logger)
|
|
3823
|
+
create_module_completeness_matrix(savedir, kpct_outprefix, logger, verbose)
|
|
3670
3824
|
else:
|
|
3671
3825
|
if logger:
|
|
3672
3826
|
logger.info(f"Module completeness matrix already exists: {module_completeness_file}")
|
|
3673
|
-
|
|
3827
|
+
conditional_output(
|
|
3828
|
+
f"OK: Using existing module completeness matrix: {module_completeness_file}",
|
|
3829
|
+
"white",
|
|
3830
|
+
verbose,
|
|
3831
|
+
)
|
|
3674
3832
|
|
|
3675
3833
|
|
|
3676
3834
|
if calculate_complementarity >= 2:
|
|
@@ -3682,7 +3840,11 @@ def analyze_ko_matrix(
|
|
|
3682
3840
|
complementarity_report_file = f"{savedir}/module_completeness_complementarity_{n_members}member.tsv"
|
|
3683
3841
|
if os.path.exists(complementarity_report_file):
|
|
3684
3842
|
logger.info(f"Complementarity report for {n_members}-member combinations already exists: {complementarity_report_file}")
|
|
3685
|
-
|
|
3843
|
+
conditional_output(
|
|
3844
|
+
f"OK: Using existing {n_members}-member complementarity report: {complementarity_report_file}",
|
|
3845
|
+
"white",
|
|
3846
|
+
verbose,
|
|
3847
|
+
)
|
|
3686
3848
|
else:
|
|
3687
3849
|
logger.info(f"Generating complementarity report for {n_members}-member combinations")
|
|
3688
3850
|
generate_complementarity_report(savedir, n_members, logger, verbose)
|
|
@@ -3691,7 +3853,7 @@ def analyze_ko_matrix(
|
|
|
3691
3853
|
if del_tmp:
|
|
3692
3854
|
if logger:
|
|
3693
3855
|
logger.info("Cleaning up temporary files")
|
|
3694
|
-
remove_temp_files(savedir, logger)
|
|
3856
|
+
remove_temp_files(savedir, logger, verbose)
|
|
3695
3857
|
|
|
3696
3858
|
# Generate final resource usage summary
|
|
3697
3859
|
log_final_resource_summary(resource_log_file, start_time, logger, verbose)
|
|
@@ -3702,7 +3864,8 @@ def analyze_ko_matrix(
|
|
|
3702
3864
|
except Exception as e:
|
|
3703
3865
|
if logger:
|
|
3704
3866
|
logger.error(f"Error in KPCT analysis: {str(e)}", exc_info=True)
|
|
3705
|
-
|
|
3867
|
+
else:
|
|
3868
|
+
log_error(f"Error in KPCT analysis: {str(e)}", logger=logger)
|
|
3706
3869
|
exit(1)
|
|
3707
3870
|
|
|
3708
3871
|
|