dayhoff-tools 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/__init__.py +0 -0
- dayhoff_tools/chemistry/standardizer.py +297 -0
- dayhoff_tools/chemistry/utils.py +63 -0
- dayhoff_tools/cli/__init__.py +0 -0
- dayhoff_tools/cli/main.py +90 -0
- dayhoff_tools/cli/swarm_commands.py +156 -0
- dayhoff_tools/cli/utility_commands.py +244 -0
- dayhoff_tools/deployment/base.py +434 -0
- dayhoff_tools/deployment/deploy_aws.py +458 -0
- dayhoff_tools/deployment/deploy_gcp.py +176 -0
- dayhoff_tools/deployment/deploy_utils.py +781 -0
- dayhoff_tools/deployment/job_runner.py +153 -0
- dayhoff_tools/deployment/processors.py +125 -0
- dayhoff_tools/deployment/swarm.py +591 -0
- dayhoff_tools/embedders.py +893 -0
- dayhoff_tools/fasta.py +1082 -0
- dayhoff_tools/file_ops.py +261 -0
- dayhoff_tools/gcp.py +85 -0
- dayhoff_tools/h5.py +542 -0
- dayhoff_tools/kegg.py +37 -0
- dayhoff_tools/logs.py +27 -0
- dayhoff_tools/mmseqs.py +164 -0
- dayhoff_tools/sqlite.py +516 -0
- dayhoff_tools/structure.py +751 -0
- dayhoff_tools/uniprot.py +434 -0
- dayhoff_tools/warehouse.py +418 -0
- dayhoff_tools-1.0.0.dist-info/METADATA +122 -0
- dayhoff_tools-1.0.0.dist-info/RECORD +30 -0
- dayhoff_tools-1.0.0.dist-info/WHEEL +4 -0
- dayhoff_tools-1.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,153 @@
|
|
1
|
+
"""Job runner for container-based deployments.
|
2
|
+
|
3
|
+
This module serves as the unified entry point for both setup and execution modes.
|
4
|
+
It can be run in three ways:
|
5
|
+
1. dh job setup - Only performs environment setup
|
6
|
+
2. dh job execute - Only executes the command (assumes setup is done)
|
7
|
+
3. dh job setup_and_execute - Performs setup and then executes (default)
|
8
|
+
|
9
|
+
Configuration is provided through environment variables:
|
10
|
+
- JOB_COMMAND: Command to execute (if any)
|
11
|
+
- REPO_ROOT: Optional path to repository root directory (for container environments)
|
12
|
+
- GOOGLE_APPLICATION_CREDENTIALS_BASE64: Enables GCP authentication when present
|
13
|
+
- USE_DVC: Set to "true" to enable DVC (requires GCP auth)
|
14
|
+
- USE_RXNFP: Set to "true" to enable RXNFP library
|
15
|
+
- FAIL_WITHOUT_GPU: Set to "true" to fail if GPU is unavailable
|
16
|
+
|
17
|
+
Additional environment variables are preserved and passed to the job orchestrator.
|
18
|
+
"""
|
19
|
+
|
20
|
+
import logging
|
21
|
+
import os
|
22
|
+
import subprocess
|
23
|
+
import sys
|
24
|
+
|
25
|
+
import typer
|
26
|
+
from dayhoff_tools.deployment.deploy_utils import (
|
27
|
+
SystemMonitor,
|
28
|
+
authenticate_gcp,
|
29
|
+
move_to_repo_root,
|
30
|
+
setup_dvc,
|
31
|
+
setup_rxnfp,
|
32
|
+
)
|
33
|
+
from dayhoff_tools.logs import configure_logs
|
34
|
+
|
35
|
+
logger = logging.getLogger(__name__)
|
36
|
+
|
37
|
+
|
38
|
+
def run_setup() -> None:
    """Perform every enabled environment-setup step.

    Each individual setup helper inspects its own environment variables
    and silently skips itself when it is not enabled, so this function
    can always be called unconditionally.
    """
    logger.info("Starting job setup")

    # Log only the variables that matter for setup, rather than dumping
    # the full (potentially sensitive) environment into the logs.
    for name in (
        "JOB_COMMAND",
        "REPO_ROOT",
        "USE_DVC",
        "USE_RXNFP",
        "FAIL_WITHOUT_GPU",
    ):
        value = os.environ.get(name)
        if value is not None:
            logger.info(f"{name}={value}")

    move_to_repo_root()

    # Each helper is a no-op unless its controlling variable is set.
    authenticate_gcp()  # Checks for GOOGLE_APPLICATION_CREDENTIALS_BASE64
    setup_dvc()  # Checks for USE_DVC="true"
    setup_rxnfp()  # Checks for USE_RXNFP="true"
    logger.info("Setup completed successfully")
|
64
|
+
|
65
|
+
|
66
|
+
def run_command() -> None:
    """Execute the command given by the JOB_COMMAND environment variable.

    The command is run through the shell with output flowing directly to
    this process's stdout/stderr (no capture), which avoids buffering
    issues and keeps logging simple. When FAIL_WITHOUT_GPU="true", a
    SystemMonitor is started for the duration of the command and stopped
    afterwards even if the command fails.

    Raises:
        ValueError: If JOB_COMMAND is empty or unset.
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    job_command = os.getenv("JOB_COMMAND")
    if not job_command:
        raise ValueError("No job command specified")

    logger.info(f"Current working directory: {os.getcwd()}")
    logger.info(f"Executing job command: {job_command}")

    # Start monitoring if FAIL_WITHOUT_GPU is enabled
    monitor = None
    if os.getenv("FAIL_WITHOUT_GPU", "").lower() == "true":
        logger.info("Starting system monitoring...")
        monitor = SystemMonitor(fail_without_gpu=True)
        monitor.start()

    try:
        # Run command directly, allowing output to flow to parent process.
        # (The return value of subprocess.run was previously bound to an
        # unused local; with check=True a non-zero exit raises instead.)
        subprocess.run(
            job_command,
            shell=True,
            check=True,
            stdout=None,  # Use parent's stdout
            stderr=None,  # Use parent's stderr
        )

        logger.info("Command completed successfully")
    except subprocess.CalledProcessError as e:
        logger.error(f"Command failed with return code: {e.returncode}")
        raise
    except Exception as e:
        logger.error(f"Error executing command: {str(e)}")
        raise
    finally:
        # Always stop the monitor, even on failure, so its thread exits.
        if monitor:
            logger.info("Stopping system monitor")
            monitor.stop()
|
108
|
+
|
109
|
+
|
110
|
+
def run_job(
    mode: str = typer.Argument(
        default="setup_and_execute",
        help="Mode to run in: setup (setup only), execute (execute only), or setup_and_execute (both)",
    )
) -> None:
    """Entry point for running a job inside a deployed container.

    Executes the command from the JOB_COMMAND environment variable (when
    present), optionally preceded by environment setup, depending on mode.

    Args:
        mode: One of:
            - setup: Only performs environment setup
            - execute: Only executes the command (assumes setup is done)
            - setup_and_execute: Performs setup and then executes (default)

    Raises:
        ValueError: If an invalid mode is specified
        Exception: If any step of the process fails
    """
    # Logging must be configured before anything else emits records.
    configure_logs()
    logger = logging.getLogger(__name__)

    logger.info(f"Job runner starting in mode: {mode}")

    valid_modes = ("setup", "execute", "setup_and_execute")
    if mode not in valid_modes:
        logger.error(f"Invalid mode: {mode}")
        raise ValueError(f"Invalid mode: {mode}")

    try:
        # "setup_and_execute" runs both phases; the other modes run one each.
        if mode != "execute":
            run_setup()

        if mode != "setup":
            run_command()

        logger.info("Job completed successfully")

    except Exception as e:
        logger.error(f"Job failed with error: {str(e)}", exc_info=True)
        sys.exit(1)
|
@@ -0,0 +1,125 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import subprocess
|
4
|
+
import shlex
|
5
|
+
from abc import ABC, abstractmethod
|
6
|
+
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class Processor(ABC):
    """Abstract base for local data processors.

    Concrete subclasses consume a single input file and produce a single
    output file (or a folder of outputs), returning its path.
    """

    @abstractmethod
    def run(self, input_file: str) -> str:
        """Perform the calculation: read from input_file, write the
        result, and return the path of whatever was produced."""
        return "output_file"
|
21
|
+
|
22
|
+
|
23
|
+
class BoltzPredictor(Processor):
    """Processor for running Boltz docking predictions.

    This class wraps the Boltz docking tool to predict protein structures
    from sequence data.
    """

    def __init__(self, num_workers: int, boltz_options: str | None = None):
        """Initialize the BoltzPredictor.

        Args:
            num_workers: Number of worker threads to use as a default.
                This can be overridden if --num_workers is present
                in boltz_options.
            boltz_options: A string containing additional command-line options
                to pass to the Boltz predictor. Options should be
                space-separated (e.g., "--option1 value1 --option2").
        """
        self.num_workers = num_workers
        self.boltz_options = boltz_options

    def run(self, input_file: str) -> str:
        """Run Boltz prediction on the input file.

        Constructs the command using the input file, default number of workers,
        and any additional options provided via `boltz_options`. If `--num_workers`
        is specified in `boltz_options`, it overrides the default `num_workers`.

        Args:
            input_file: Path to the input file containing sequences

        Returns:
            Path to the output directory created by Boltz

        Raises:
            subprocess.CalledProcessError: If Boltz prediction fails
        """
        # Boltz writes its results to "boltz_results_<input stem>" in the
        # current working directory; compute that name up front.
        input_base = os.path.splitext(os.path.basename(input_file))[0]
        expected_output_dir = f"boltz_results_{input_base}"
        logger.info(f"Expected output directory: {expected_output_dir}")

        # Start building the command
        cmd = ["boltz", "predict", input_file]

        # Parse additional options if provided
        additional_args = []
        num_workers_in_opts = False
        if self.boltz_options:
            try:
                parsed_opts = shlex.split(self.boltz_options)
                additional_args.extend(parsed_opts)
                if "--num_workers" in parsed_opts:
                    num_workers_in_opts = True
                    logger.info(
                        f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
                    )
            except ValueError as e:
                # Malformed options (e.g. unbalanced quotes): log and fall
                # back to running without additional options rather than fail.
                logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
                additional_args = []  # Clear potentially partially parsed args

        # Add num_workers if not specified in options
        if not num_workers_in_opts:
            logger.info(f"Using default num_workers: {self.num_workers}")
            cmd.extend(["--num_workers", str(self.num_workers)])

        # Add the parsed additional arguments
        cmd.extend(additional_args)

        # shlex.join quotes each argument, so the logged command is safe to
        # copy-paste even when paths contain spaces. (The previous
        # try/except AttributeError fallback for Python < 3.8 was dead code:
        # shlex.join has existed since 3.8, and this module already requires
        # 3.10+ for the `str | None` annotation above.)
        logger.info(f"Running command: {shlex.join(cmd)}")

        # Stream combined stdout/stderr through the logger in real time.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so lines arrive as they are produced
        )

        stdout = process.stdout
        if stdout:
            for line in iter(stdout.readline, ""):
                logger.info(f"BOLTZ: {line.rstrip()}")

        # Wait for process to complete
        return_code = process.wait()
        if return_code != 0:
            logger.error(f"Boltz prediction failed with exit code {return_code}")
            raise subprocess.CalledProcessError(return_code, cmd)

        logger.info(
            f"Boltz prediction completed successfully. Output in {expected_output_dir}"
        )
        return expected_output_dir
|