dayhoff-tools 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,153 @@
+ """Job runner for container-based deployments.
+
+ This module serves as the unified entry point for both setup and execution modes.
+ It can be run in three ways:
+ 1. dh job setup - Only performs environment setup
+ 2. dh job execute - Only executes the command (assumes setup is done)
+ 3. dh job setup_and_execute - Performs setup and then executes (default)
+
+ Configuration is provided through environment variables:
+ - JOB_COMMAND: Command to execute (if any)
+ - REPO_ROOT: Optional path to repository root directory (for container environments)
+ - GOOGLE_APPLICATION_CREDENTIALS_BASE64: Enables GCP authentication when present
+ - USE_DVC: Set to "true" to enable DVC (requires GCP auth)
+ - USE_RXNFP: Set to "true" to enable RXNFP library
+ - FAIL_WITHOUT_GPU: Set to "true" to fail if GPU is unavailable
+
+ Additional environment variables are preserved and passed to the job orchestrator.
+ """
+
+ import logging
+ import os
+ import subprocess
+ import sys
+
+ import typer
+ from dayhoff_tools.deployment.deploy_utils import (
+     SystemMonitor,
+     authenticate_gcp,
+     move_to_repo_root,
+     setup_dvc,
+     setup_rxnfp,
+ )
+ from dayhoff_tools.logs import configure_logs
+
+ logger = logging.getLogger(__name__)
+
+
+ def run_setup() -> None:
+     """Run all enabled setup steps.
+
+     Each setup function checks its own requirements and skips if not enabled.
+     """
+     logger.info("Starting job setup")
+
+     # Only log important environment variables
+     important_vars = [
+         "JOB_COMMAND",
+         "REPO_ROOT",
+         "USE_DVC",
+         "USE_RXNFP",
+         "FAIL_WITHOUT_GPU",
+     ]
+     for key in important_vars:
+         if key in os.environ:
+             logger.info(f"{key}={os.environ[key]}")
+
+     move_to_repo_root()
+
+     # Run setup steps
+     authenticate_gcp() # Checks for GOOGLE_APPLICATION_CREDENTIALS_BASE64
+     setup_dvc() # Checks for USE_DVC="true"
+     setup_rxnfp() # Checks for USE_RXNFP="true"
+     logger.info("Setup completed successfully")
+
+
+ def run_command() -> None:
+     """Execute the job command if specified.
+
+     Raises:
+         ValueError: If no job command is specified
+     """
+     job_command = os.getenv("JOB_COMMAND")
+     if not job_command:
+         raise ValueError("No job command specified")
+
+     logger.info(f"Current working directory: {os.getcwd()}")
+     logger.info(f"Executing job command: {job_command}")
+
+     # Start monitoring if FAIL_WITHOUT_GPU is enabled
+     monitor = None
+     if os.getenv("FAIL_WITHOUT_GPU", "").lower() == "true":
+         logger.info("Starting system monitoring...")
+         monitor = SystemMonitor(fail_without_gpu=True)
+         monitor.start()
+
+     try:
+         # Run command directly, allowing output to flow to parent process
+         # This avoids buffering issues and simplifies logging
+         result = subprocess.run(
+             job_command,
+             shell=True,
+             check=True,
+             stdout=None, # Use parent's stdout
+             stderr=None, # Use parent's stderr
+         )
+
+         logger.info("Command completed successfully")
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Command failed with return code: {e.returncode}")
+         raise
+     except Exception as e:
+         logger.error(f"Error executing command: {str(e)}")
+         raise
+     finally:
+         if monitor:
+             logger.info("Stopping system monitor")
+             monitor.stop()
+
+
+ def run_job(
+     mode: str = typer.Argument(
+         default="setup_and_execute",
+         help="Mode to run in: setup (setup only), execute (execute only), or setup_and_execute (both)",
+     )
+ ) -> None:
+     """Run a job command in the specified mode.
+
+     This function executes the job command given by the JOB_COMMAND environment variable.
+     It is meant for use in job containers after deployment.
+
+     Args:
+         mode: The execution mode to use. One of:
+             - setup: Only performs environment setup
+             - execute: Only executes the command (assumes setup is done)
+             - setup_and_execute: Performs setup and then executes (default)
+
+     Raises:
+         ValueError: If an invalid mode is specified
+         Exception: If any step of the process fails
+     """
+     # Configure logging first thing
+     configure_logs()
+     logger = logging.getLogger(__name__)
+
+     logger.info(f"Job runner starting in mode: {mode}")
+
+     if mode not in ["setup", "execute", "setup_and_execute"]:
+         logger.error(f"Invalid mode: {mode}")
+         raise ValueError(f"Invalid mode: {mode}")
+
+     try:
+         # Run in appropriate mode
+         if mode in ["setup", "setup_and_execute"]:
+             run_setup()
+
+         if mode in ["execute", "setup_and_execute"]:
+             run_command()
+
+         logger.info("Job completed successfully")
+
+     except Exception as e:
+         logger.error(f"Job failed with error: {str(e)}", exc_info=True)
+         sys.exit(1)
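
For illustration, here is a minimal usage sketch of the runner above, assuming a container entrypoint that sets the documented environment variables and then invokes the dh CLI; the command string and repository path are placeholder values, not part of the package:

import os
import subprocess

# Placeholder job configuration (values are hypothetical examples).
env = {
    **os.environ,
    "JOB_COMMAND": "python train.py",  # hypothetical command to run
    "REPO_ROOT": "/workspace/repo",    # hypothetical repo location
    "USE_DVC": "true",
    "FAIL_WITHOUT_GPU": "true",
}

# Entry point documented in the module docstring: setup, then execution.
subprocess.run(["dh", "job", "setup_and_execute"], env=env, check=True)
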
@@ -0,0 +1,125 @@
+ import logging
+ import os
+ import subprocess
+ import shlex
+ from abc import ABC, abstractmethod
+
+ logger = logging.getLogger(__name__)
+
+
+ class Processor(ABC):
+     """Processes data locally. Abstract class for specific calculations.
+     Takes in a single file and produces a single file or folder of outputs."""
+
+     @abstractmethod
+     def run(self, input_file: str) -> str:
+         """Do the calculation, including reading from input_file
+         and writing to output_file"""
+         output_path = "output_file"
+
+         return output_path
+
+
+ class BoltzPredictor(Processor):
+     """Processor for running Boltz docking predictions.
+
+     This class wraps the Boltz docking tool to predict protein structures
+     from sequence data.
+     """
+
+     def __init__(self, num_workers: int, boltz_options: str | None = None):
+         """Initialize the BoltzPredictor.
+
+         Args:
+             num_workers: Number of worker threads to use as a default.
+                 This can be overridden if --num_workers is present
+                 in boltz_options.
+             boltz_options: A string containing additional command-line options
+                 to pass to the Boltz predictor. Options should be
+                 space-separated (e.g., "--option1 value1 --option2").
+         """
+         self.num_workers = num_workers
+         self.boltz_options = boltz_options
+
+     def run(self, input_file: str) -> str:
+         """Run Boltz prediction on the input file.
+
+         Constructs the command using the input file, default number of workers,
+         and any additional options provided via `boltz_options`. If `--num_workers`
+         is specified in `boltz_options`, it overrides the default `num_workers`.
+
+         Args:
+             input_file: Path to the input file containing sequences
+
+         Returns:
+             Path to the output directory created by Boltz
+
+         Raises:
+             subprocess.CalledProcessError: If Boltz prediction fails
+         """
+         # Determine expected output directory name
+         input_base = os.path.splitext(os.path.basename(input_file))[0]
+         expected_output_dir = f"boltz_results_{input_base}"
+         logger.info(f"Expected output directory: {expected_output_dir}")
+
+         # Start building the command
+         cmd = ["boltz", "predict", input_file]
+
+         # Parse additional options if provided
+         additional_args = []
+         num_workers_in_opts = False
+         if self.boltz_options:
+             try:
+                 parsed_opts = shlex.split(self.boltz_options)
+                 additional_args.extend(parsed_opts)
+                 if "--num_workers" in parsed_opts:
+                     num_workers_in_opts = True
+                     logger.info(
+                         f"Using --num_workers from BOLTZ_OPTIONS: {self.boltz_options}"
+                     )
+             except ValueError as e:
+                 logger.error(f"Error parsing BOLTZ_OPTIONS '{self.boltz_options}': {e}")
+                 # Decide if we should raise an error or proceed without options
+                 # For now, proceed without the additional options
+                 additional_args = [] # Clear potentially partially parsed args
+
+         # Add num_workers if not specified in options
+         if not num_workers_in_opts:
+             logger.info(f"Using default num_workers: {self.num_workers}")
+             cmd.extend(["--num_workers", str(self.num_workers)])
+
+         # Add the parsed additional arguments
+         cmd.extend(additional_args)
+
+         # Log the final command
+         # Use shlex.join for safer command logging, especially if paths/args have spaces
+         try:
+             safe_cmd_str = shlex.join(cmd)
+             logger.info(f"Running command: {safe_cmd_str}")
+         except AttributeError: # shlex.join is Python 3.8+
+             logger.info(f"Running command: {' '.join(cmd)}")
+
+         # Stream output in real-time
+         process = subprocess.Popen(
+             cmd,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.STDOUT,
+             text=True,
+             bufsize=1,
+         )
+
+         stdout = process.stdout
+         if stdout:
+             for line in iter(stdout.readline, ""):
+                 logger.info(f"BOLTZ: {line.rstrip()}")
+
+         # Wait for process to complete
+         return_code = process.wait()
+         if return_code != 0:
+             logger.error(f"Boltz prediction failed with exit code {return_code}")
+             raise subprocess.CalledProcessError(return_code, cmd)
+
+         logger.info(
+             f"Boltz prediction completed successfully. Output in {expected_output_dir}"
+         )
+         return expected_output_dir
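
For illustration, a minimal sketch of how BoltzPredictor might be used; the import path is an assumption (the diff does not show where this module lives in the package) and the input filename is a placeholder:

from dayhoff_tools.deployment.processors import BoltzPredictor  # assumed path

# Passing --num_workers inside boltz_options overrides the default given here.
predictor = BoltzPredictor(num_workers=4, boltz_options="--num_workers 2")
output_dir = predictor.run("sequences.fasta")  # placeholder input file
# run() returns the expected Boltz output directory, e.g. "boltz_results_sequences"
print(output_dir)
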