expops 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- expops-0.1.3.dist-info/METADATA +826 -0
- expops-0.1.3.dist-info/RECORD +86 -0
- expops-0.1.3.dist-info/WHEEL +5 -0
- expops-0.1.3.dist-info/entry_points.txt +3 -0
- expops-0.1.3.dist-info/licenses/LICENSE +674 -0
- expops-0.1.3.dist-info/top_level.txt +1 -0
- mlops/__init__.py +0 -0
- mlops/__main__.py +11 -0
- mlops/_version.py +34 -0
- mlops/adapters/__init__.py +12 -0
- mlops/adapters/base.py +86 -0
- mlops/adapters/config_schema.py +89 -0
- mlops/adapters/custom/__init__.py +3 -0
- mlops/adapters/custom/custom_adapter.py +447 -0
- mlops/adapters/plugin_manager.py +113 -0
- mlops/adapters/sklearn/__init__.py +3 -0
- mlops/adapters/sklearn/adapter.py +94 -0
- mlops/cluster/__init__.py +3 -0
- mlops/cluster/controller.py +496 -0
- mlops/cluster/process_runner.py +91 -0
- mlops/cluster/providers.py +258 -0
- mlops/core/__init__.py +95 -0
- mlops/core/custom_model_base.py +38 -0
- mlops/core/dask_networkx_executor.py +1265 -0
- mlops/core/executor_worker.py +1239 -0
- mlops/core/experiment_tracker.py +81 -0
- mlops/core/graph_types.py +64 -0
- mlops/core/networkx_parser.py +135 -0
- mlops/core/payload_spill.py +278 -0
- mlops/core/pipeline_utils.py +162 -0
- mlops/core/process_hashing.py +216 -0
- mlops/core/step_state_manager.py +1298 -0
- mlops/core/step_system.py +956 -0
- mlops/core/workspace.py +99 -0
- mlops/environment/__init__.py +10 -0
- mlops/environment/base.py +43 -0
- mlops/environment/conda_manager.py +307 -0
- mlops/environment/factory.py +70 -0
- mlops/environment/pyenv_manager.py +146 -0
- mlops/environment/setup_env.py +31 -0
- mlops/environment/system_manager.py +66 -0
- mlops/environment/utils.py +105 -0
- mlops/environment/venv_manager.py +134 -0
- mlops/main.py +527 -0
- mlops/managers/project_manager.py +400 -0
- mlops/managers/reproducibility_manager.py +575 -0
- mlops/platform.py +996 -0
- mlops/reporting/__init__.py +16 -0
- mlops/reporting/context.py +187 -0
- mlops/reporting/entrypoint.py +292 -0
- mlops/reporting/kv_utils.py +77 -0
- mlops/reporting/registry.py +50 -0
- mlops/runtime/__init__.py +9 -0
- mlops/runtime/context.py +34 -0
- mlops/runtime/env_export.py +113 -0
- mlops/storage/__init__.py +12 -0
- mlops/storage/adapters/__init__.py +9 -0
- mlops/storage/adapters/gcp_kv_store.py +778 -0
- mlops/storage/adapters/gcs_object_store.py +96 -0
- mlops/storage/adapters/memory_store.py +240 -0
- mlops/storage/adapters/redis_store.py +438 -0
- mlops/storage/factory.py +199 -0
- mlops/storage/interfaces/__init__.py +6 -0
- mlops/storage/interfaces/kv_store.py +118 -0
- mlops/storage/path_utils.py +38 -0
- mlops/templates/premier-league/charts/plot_metrics.js +70 -0
- mlops/templates/premier-league/charts/plot_metrics.py +145 -0
- mlops/templates/premier-league/charts/requirements.txt +6 -0
- mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
- mlops/templates/premier-league/configs/project_config.yaml +207 -0
- mlops/templates/premier-league/data/England CSV.csv +12154 -0
- mlops/templates/premier-league/models/premier_league_model.py +638 -0
- mlops/templates/premier-league/requirements.txt +8 -0
- mlops/templates/sklearn-basic/README.md +22 -0
- mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
- mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
- mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
- mlops/templates/sklearn-basic/data/train.csv +14 -0
- mlops/templates/sklearn-basic/models/model.py +62 -0
- mlops/templates/sklearn-basic/requirements.txt +10 -0
- mlops/web/__init__.py +3 -0
- mlops/web/server.py +585 -0
- mlops/web/ui/index.html +52 -0
- mlops/web/ui/mlops-charts.js +357 -0
- mlops/web/ui/script.js +1244 -0
- mlops/web/ui/styles.css +248 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
import yaml
|
|
8
|
+
import logging
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional, Dict, Any
|
|
11
|
+
|
|
12
|
+
def _default_workspace_dir() -> Path:
|
|
13
|
+
raw = os.environ.get("MLOPS_WORKSPACE_DIR")
|
|
14
|
+
if raw:
|
|
15
|
+
try:
|
|
16
|
+
return Path(raw).expanduser().resolve()
|
|
17
|
+
except Exception:
|
|
18
|
+
return Path(raw)
|
|
19
|
+
return Path.cwd()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Source-checkout support: make <workspace>/src importable when present; otherwise assume installed package.
_WORKSPACE_ROOT = _default_workspace_dir()  # resolved once at import time
_SRC_DIR = _WORKSPACE_ROOT / "src"
if _SRC_DIR.exists() and str(_SRC_DIR) not in sys.path:
    # Prepend so a source checkout shadows any installed copy of the package.
    sys.path.insert(0, str(_SRC_DIR))
|
|
27
|
+
|
|
28
|
+
class ClusterController:
    """Drives a project run against a Dask cluster: prepares a per-project
    interpreter, manages the provider lifecycle, and executes the pipeline
    with per-run log files."""

    def __init__(self, project_dir: Path):
        """Anchor the controller at *project_dir* and create working dirs."""
        self.project_dir = project_dir.resolve()
        self.logs_dir = self.project_dir / "logs"
        self.artifacts_dir = self.project_dir / "artifacts"
        self.logger = logging.getLogger("CLUSTER_CONTROLLER")
        self._ensure_dirs()

    def _ensure_dirs(self) -> None:
        """Create the logs and artifacts directories if they are absent."""
        for directory in (self.logs_dir, self.artifacts_dir):
            directory.mkdir(parents=True, exist_ok=True)
|
|
39
|
+
|
|
40
|
+
def _build_kv_env(self, project_id: str) -> Dict[str, str]:
|
|
41
|
+
"""Build environment variables for KV backend from project or cluster config."""
|
|
42
|
+
env: Dict[str, str] = {}
|
|
43
|
+
try:
|
|
44
|
+
import yaml as _yaml
|
|
45
|
+
from mlops.runtime.env_export import export_kv_env
|
|
46
|
+
proj_cfg_path = self.project_dir / "projects" / project_id / "configs" / "project_config.yaml"
|
|
47
|
+
proj_cfg = {}
|
|
48
|
+
if proj_cfg_path.exists():
|
|
49
|
+
with open(proj_cfg_path, 'r') as _f:
|
|
50
|
+
proj_cfg = _yaml.safe_load(_f) or {}
|
|
51
|
+
# Navigate to model.parameters.cache.backend
|
|
52
|
+
try:
|
|
53
|
+
kv_cfg = ((proj_cfg.get('model') or {}).get('parameters') or {}).get('cache', {}) or {}
|
|
54
|
+
backend_cfg = kv_cfg.get('backend') if isinstance(kv_cfg, dict) else {}
|
|
55
|
+
if isinstance(backend_cfg, dict) and backend_cfg:
|
|
56
|
+
env.update(
|
|
57
|
+
export_kv_env(
|
|
58
|
+
backend_cfg,
|
|
59
|
+
workspace_root=self.project_dir,
|
|
60
|
+
project_root=(self.project_dir / "projects" / project_id),
|
|
61
|
+
)
|
|
62
|
+
)
|
|
63
|
+
except Exception:
|
|
64
|
+
pass
|
|
65
|
+
# Fallback to cluster kv_store block if present
|
|
66
|
+
if not env:
|
|
67
|
+
cfg_path = self.project_dir / "projects" / project_id / "configs" / "cluster_config.yaml"
|
|
68
|
+
cluster_cfg_local: Dict[str, Any] = {}
|
|
69
|
+
if cfg_path.exists():
|
|
70
|
+
with open(cfg_path, 'r') as _f:
|
|
71
|
+
cluster_cfg_local = _yaml.safe_load(_f) or {}
|
|
72
|
+
cluster_kv = cluster_cfg_local.get('kv_store') if isinstance(cluster_cfg_local, dict) else None
|
|
73
|
+
if isinstance(cluster_kv, dict):
|
|
74
|
+
backend = cluster_kv.get('backend', 'redis')
|
|
75
|
+
if backend == 'redis':
|
|
76
|
+
backend_cfg2 = {
|
|
77
|
+
"type": "redis",
|
|
78
|
+
"host": cluster_kv.get("host"),
|
|
79
|
+
"port": cluster_kv.get("port"),
|
|
80
|
+
"db": cluster_kv.get("db"),
|
|
81
|
+
"password": cluster_kv.get("password"),
|
|
82
|
+
}
|
|
83
|
+
env.update(
|
|
84
|
+
export_kv_env(
|
|
85
|
+
backend_cfg2,
|
|
86
|
+
workspace_root=self.project_dir,
|
|
87
|
+
project_root=(self.project_dir / "projects" / project_id),
|
|
88
|
+
)
|
|
89
|
+
)
|
|
90
|
+
elif backend == 'gcp':
|
|
91
|
+
backend_cfg2 = {
|
|
92
|
+
"type": "gcp",
|
|
93
|
+
"gcp_project": cluster_kv.get("gcp_project"),
|
|
94
|
+
"emulator_host": cluster_kv.get("emulator_host"),
|
|
95
|
+
"credentials_json": cluster_kv.get("credentials_json"),
|
|
96
|
+
}
|
|
97
|
+
env.update(
|
|
98
|
+
export_kv_env(
|
|
99
|
+
backend_cfg2,
|
|
100
|
+
workspace_root=self.project_dir,
|
|
101
|
+
project_root=(self.project_dir / "projects" / project_id),
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
except Exception:
|
|
105
|
+
env = {}
|
|
106
|
+
return env
|
|
107
|
+
|
|
108
|
+
def _load_executor_config(self, project_id: str) -> Dict[str, Any]:
|
|
109
|
+
"""Load executor config block from the project's project_config.yaml."""
|
|
110
|
+
try:
|
|
111
|
+
proj_cfg_path = self.project_dir / "projects" / project_id / "configs" / "project_config.yaml"
|
|
112
|
+
if not proj_cfg_path.exists():
|
|
113
|
+
return {}
|
|
114
|
+
with open(proj_cfg_path, 'r') as f:
|
|
115
|
+
proj_cfg = yaml.safe_load(f) or {}
|
|
116
|
+
executor_cfg = ((proj_cfg.get('model') or {}).get('parameters') or {}).get('executor', {}) or {}
|
|
117
|
+
return executor_cfg if isinstance(executor_cfg, dict) else {}
|
|
118
|
+
except Exception:
|
|
119
|
+
return {}
|
|
120
|
+
|
|
121
|
+
@staticmethod
|
|
122
|
+
def _extract_comm_compression(executor_cfg: Dict[str, Any]) -> Optional[str]:
|
|
123
|
+
if not isinstance(executor_cfg, dict):
|
|
124
|
+
return None
|
|
125
|
+
dask_cfg = executor_cfg.get('dask') or executor_cfg.get('dask_config') or {}
|
|
126
|
+
if not isinstance(dask_cfg, dict):
|
|
127
|
+
return None
|
|
128
|
+
comm_cfg = dask_cfg.get('comm') or {}
|
|
129
|
+
if isinstance(comm_cfg, dict):
|
|
130
|
+
comp = comm_cfg.get('compression') or comm_cfg.get('codec')
|
|
131
|
+
if comp:
|
|
132
|
+
return str(comp)
|
|
133
|
+
comp = dask_cfg.get('compression')
|
|
134
|
+
return str(comp) if comp else None
|
|
135
|
+
|
|
136
|
+
    def run_project_with_dask(self, project_id: str,
                              cluster_provider: Optional[str] = None,
                              num_workers: int = 2,
                              provider_options: Optional[Dict[str, Any]] = None) -> None:
        """Run the project locally while provisioning a Dask cluster via a provider (slurm/ansible).

        High-level flow:
          1. Resolve KV env vars and Dask comm compression from project config.
          2. Prepare a per-project Python interpreter (setup_env.py) and a
             unique per-run log file.
          3. Best-effort install of this package plus dask/distributed (and
             dask-jobqueue for slurm) into that interpreter.
          4. Verify dask is importable both by the project interpreter and by
             this running process.
          5. Optionally start the requested provider, export its scheduler
             address, run ``mlops.main run <project_id>`` as a subprocess,
             always stop the provider, and propagate a non-zero exit code.

        Raises:
            RuntimeError: when environment setup or Dask availability fails.
            SystemExit: with the pipeline subprocess's non-zero return code.
        """
        self.logger.info(f"Project directory: {self.project_dir}")
        self.logger.info(f"Logs directory: {self.logs_dir}")
        self.logger.info(f"Artifacts directory: {self.artifacts_dir}")

        # Prepare KV env from config
        kv_env = self._build_kv_env(project_id)
        executor_cfg = self._load_executor_config(project_id)
        comm_compression = self._extract_comm_compression(executor_cfg)

        # Prepare a per-project interpreter using the environment manager
        # (cached under ~/.cache so repeat runs reuse the same environment).
        cache_base = Path.home() / ".cache" / "mlops-platform" / project_id
        env_file = cache_base / "python_interpreter.txt"
        env_file.parent.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"Ensuring project interpreter at {env_file}")
        setup_cmd = [
            sys.executable,
            str(self.project_dir / "src" / "mlops" / "environment" / "setup_env.py"),
            "--project-id", project_id,
            "--project-dir", str(self.project_dir),
            "--env-file", str(env_file),
        ]
        # Log file for this run (unique per run); honor an externally supplied path.
        env_log_hint = os.environ.get("MLOPS_RUN_LOG_FILE")
        if env_log_hint:
            proj_log_file = Path(env_log_hint)
            proj_log_file.parent.mkdir(parents=True, exist_ok=True)
        else:
            proj_logs_dir = self.project_dir / "projects" / project_id / "logs"
            proj_logs_dir.mkdir(parents=True, exist_ok=True)
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            proj_log_file = proj_logs_dir / f"{project_id}_{timestamp}.log"
        os.environ["MLOPS_RUN_LOG_FILE"] = str(proj_log_file)
        # Note: pointers are printed in main() before re-exec; avoid duplicate prints here
        with open(proj_log_file, "a", encoding="utf-8") as lf:
            res = subprocess.run(setup_cmd, stdout=lf, stderr=lf, text=True)
        if res.returncode != 0:
            self.logger.error("Project environment setup failed. See project log for details.")
            raise RuntimeError("Failed to set up project environment")

        # Read the project interpreter path produced by setup
        if not env_file.exists():
            raise RuntimeError(f"Missing environment interpreter file at {env_file}")
        project_python = env_file.read_text().strip()
        if not project_python:
            raise RuntimeError("Empty interpreter path read from env file")
        # Make sure required dependencies are present in the project environment.
        # Installs below are best-effort (check=False); hard failures are caught
        # later by the import gates.
        try:
            # Upgrade pip and core build tools
            with open(proj_log_file, "a", encoding="utf-8") as lf:
                subprocess.run([project_python, "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], check=False, stdout=lf, stderr=lf, text=True)
            # Install our package (and extras if needed) into the project env using an absolute path
            repo_path = str(self.project_dir)
            if cluster_provider == 'slurm':
                with open(proj_log_file, "a", encoding="utf-8") as lf:
                    subprocess.run([project_python, "-m", "pip", "install", "-e", f"{repo_path}[slurm]"], check=False, stdout=lf, stderr=lf, text=True)
            else:
                with open(proj_log_file, "a", encoding="utf-8") as lf:
                    subprocess.run([project_python, "-m", "pip", "install", "-e", repo_path], check=False, stdout=lf, stderr=lf, text=True)
            # Ensure pydantic is available for adapters/config schema regardless of editable install success
            try:
                subprocess.run([project_python, "-c", "import pydantic"], check=True, capture_output=True, text=True)
            except Exception:
                with open(proj_log_file, "a", encoding="utf-8") as lf:
                    lf.write("Installing missing dependency: pydantic>=2\n")
                    res = subprocess.run([project_python, "-m", "pip", "install", "pydantic>=2"], check=False, stdout=lf, stderr=lf, text=True)
                    if res.returncode != 0:
                        lf.write("Retrying pydantic install with --user...\n")
                        subprocess.run([project_python, "-m", "pip", "install", "--user", "pydantic>=2"], check=False, stdout=lf, stderr=lf, text=True)
        except Exception:
            pass

        # Verify core distributed dependencies; attempt to install if missing
        def _ensure_importable(py: str, module: str) -> bool:
            # Probe the given interpreter for a module by importing it in a subprocess.
            try:
                subprocess.run([py, "-c", f"import {module}"], check=True, capture_output=True, text=True)
                return True
            except Exception:
                return False

        core_missing = []
        for mod in ("dask", "distributed"):
            if not _ensure_importable(project_python, mod):
                core_missing.append(mod)
        if core_missing:
            self.logger.info(f"Installing missing core deps into project env: {', '.join(core_missing)}")
            with open(proj_log_file, "a", encoding="utf-8") as lf:
                subprocess.run([project_python, "-m", "pip", "install", *core_missing], check=False, stdout=lf, stderr=lf, text=True)
        # For slurm provider ensure dask-jobqueue is present
        if cluster_provider == 'slurm' and not _ensure_importable(project_python, "dask_jobqueue"):
            self.logger.info("Installing missing SLURM extra: dask-jobqueue>=0.8.0")
            with open(proj_log_file, "a", encoding="utf-8") as lf:
                subprocess.run([project_python, "-m", "pip", "install", "dask-jobqueue>=0.8.0"], check=False, stdout=lf, stderr=lf, text=True)
        # Final gate: give a clear error early if deps are still missing
        if not _ensure_importable(project_python, "dask") or not _ensure_importable(project_python, "distributed"):
            raise RuntimeError("Dask is not available in the project environment. Ensure connectivity or pin it in your project's requirements.txt.")
        if cluster_provider == 'slurm' and not _ensure_importable(project_python, "dask_jobqueue"):
            raise RuntimeError("dask-jobqueue is not available in the project environment. Ensure connectivity or pin it in your project's requirements.txt.")

        # In-process import sanity (rare HPC oddities): ensure this running interpreter can import dask/distributed
        try:
            import dask as _d  # type: ignore
            import distributed as _dist  # type: ignore
            try:
                _dv = getattr(_d, "__version__", "unknown")
            except Exception:
                _dv = "unknown"
            try:
                _disv = getattr(_dist, "__version__", "unknown")
            except Exception:
                _disv = "unknown"
            self.logger.info(f"Dask import OK in-process: dask={_dv}, distributed={_disv}")
        except Exception as _imp_e:
            self.logger.warning(f"In-process import of dask failed: {_imp_e}. Patching sys.path using project env site-packages...")
            try:
                # Ask the project interpreter where its site-packages lives.
                sp = subprocess.run(
                    [project_python, "-c", "import sysconfig; print(sysconfig.get_paths().get('purelib') or '')"],
                    capture_output=True,
                    text=True,
                    check=False,
                )
                site_pkgs = (sp.stdout or "").strip()
                if site_pkgs:
                    if site_pkgs not in sys.path:
                        sys.path.insert(0, site_pkgs)
                    try:
                        import dask as _d2  # type: ignore
                        import distributed as _dist2  # type: ignore
                        _dv2 = getattr(_d2, "__version__", "unknown")
                        _disv2 = getattr(_dist2, "__version__", "unknown")
                        self.logger.info(f"Dask import OK after patch: dask={_dv2}, distributed={_disv2}")
                    except Exception as _imp_e2:
                        raise RuntimeError(f"Dask remains unimportable after site-packages patch: {_imp_e2}")
                else:
                    raise RuntimeError("Unable to resolve site-packages path for project interpreter")
            except Exception as _patch_e:
                # NOTE: this clause also re-wraps the two RuntimeErrors raised
                # just above, so all patch failures surface with one message.
                raise RuntimeError(f"Failed to ensure in-process dask importability: {_patch_e}")

        # Prepare environment for running the pipeline
        env = os.environ.copy()
        if comm_compression:
            env['DASK_DISTRIBUTED__COMM__COMPRESSION'] = comm_compression
        else:
            # Default to zlib only when the caller has not already set a codec.
            env.setdefault('DASK_DISTRIBUTED__COMM__COMPRESSION', 'zlib')
        env['PYTHONUNBUFFERED'] = '1'
        env['PYTHONPATH'] = f"{self.project_dir / 'src'}:{env.get('PYTHONPATH', '')}".rstrip(':')
        for k, v in kv_env.items():
            env[k] = v

        provider_options = dict(provider_options or {})
        if comm_compression:
            provider_options.setdefault('comm_compression', comm_compression)

        # If a provider is requested, start it here, run pipeline, then stop it
        provider_obj = None
        if cluster_provider:
            self.logger.info(f"Starting provider: {cluster_provider} with {num_workers} workers")
            try:
                from mlops.cluster.providers import SlurmClusterProvider, AnsibleClusterProvider
                if cluster_provider == 'slurm':
                    provider_obj = SlurmClusterProvider()
                elif cluster_provider == 'ansible':
                    provider_obj = AnsibleClusterProvider()
                else:
                    raise ValueError(f"Unknown provider: {cluster_provider}")
                # Ensure workers use the same Python interpreter as the prepared project environment
                provider_options = provider_options or {}
                cluster_kwargs = dict((provider_options.get('cluster_kwargs') or {}))
                cluster_kwargs.setdefault('python', project_python)
                provider_options['cluster_kwargs'] = cluster_kwargs
                _, addr = provider_obj.start(num_workers=num_workers, options=provider_options or {})
                if not addr:
                    raise RuntimeError("Failed to obtain scheduler address from provider")
                env['DASK_SCHEDULER_ADDRESS'] = str(addr)
                self.logger.info(f"Using Dask scheduler at {addr}")
                try:
                    print(f"Dask scheduler: {addr}")
                except Exception:
                    pass
            except Exception as e:
                # Tear down a half-started provider before propagating the error.
                if provider_obj:
                    try:
                        provider_obj.stop()
                    except Exception:
                        pass
                raise

        # Run the pipeline using the prepared interpreter
        self.logger.info(f"Running pipeline for project '{project_id}'")
        run_cmd = [project_python, "-m", "mlops.main", "run", project_id]
        # Ensure inner processes honor this unique run log
        env['MLOPS_RUN_LOG_FILE'] = str(proj_log_file)
        # Prevent recursion back into the controller when the CLI decides how to run
        env['MLOPS_FORCE_LOCAL'] = '1'
        with open(proj_log_file, "a", encoding="utf-8") as lf:
            result = subprocess.run(run_cmd, env=env, stdout=lf, stderr=lf, text=True)
        rc = result.returncode
        # Try to extract and surface the run ID from the project log
        try:
            run_id_val = None
            with open(proj_log_file, "r", encoding="utf-8") as rf:
                for line in rf:
                    if "with run_id:" in line:
                        # e.g., Executing project 'my-project' with run_id: project-my-project-XXXX
                        idx = line.find("with run_id:")
                        if idx >= 0:
                            run_id_val = line[idx + len("with run_id:"):].strip().strip("'\"")
                    elif "ID: 'project-" in line:
                        # e.g., [NoOpTracker] Started run ... ID: 'project-...'
                        try:
                            start = line.index("ID: '") + 5
                            end = line.index("'", start)
                            run_id_val = line[start:end]
                        except Exception:
                            pass
            if run_id_val:
                print(f"Run ID: {run_id_val}")
        except Exception:
            pass
        # Always attempt to stop provider
        if provider_obj:
            try:
                provider_obj.stop()
            except Exception as e:
                self.logger.warning(f"Provider stop returned error: {e}")
        if rc != 0:
            raise SystemExit(rc)
        self.logger.info("Project completed successfully via Dask scheduler.")
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _setup_logging(logs_dir: Path, project_id: str) -> None:
|
|
371
|
+
logs_dir.mkdir(parents=True, exist_ok=True)
|
|
372
|
+
log_file = logs_dir / f"cluster_controller_{project_id}.log"
|
|
373
|
+
root = logging.getLogger()
|
|
374
|
+
# Avoid duplicate handlers for repeated invocations
|
|
375
|
+
exists = False
|
|
376
|
+
for h in root.handlers:
|
|
377
|
+
try:
|
|
378
|
+
if isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', '') == str(log_file):
|
|
379
|
+
exists = True
|
|
380
|
+
break
|
|
381
|
+
except Exception:
|
|
382
|
+
continue
|
|
383
|
+
if not exists:
|
|
384
|
+
root.setLevel(logging.INFO)
|
|
385
|
+
fmt = logging.Formatter("%(asctime)s %(levelname)s [%(name)s] %(message)s")
|
|
386
|
+
fh = logging.FileHandler(str(log_file), encoding="utf-8")
|
|
387
|
+
fh.setLevel(logging.INFO)
|
|
388
|
+
fh.setFormatter(fmt)
|
|
389
|
+
root.addHandler(fh)
|
|
390
|
+
# Also echo warnings/errors to the console so critical info is visible in terminal
|
|
391
|
+
try:
|
|
392
|
+
has_stream = any(isinstance(h, logging.StreamHandler) for h in root.handlers)
|
|
393
|
+
if not has_stream:
|
|
394
|
+
ch = logging.StreamHandler(sys.stdout)
|
|
395
|
+
ch.setLevel(logging.WARNING)
|
|
396
|
+
ch.setFormatter(fmt)
|
|
397
|
+
root.addHandler(ch)
|
|
398
|
+
except Exception:
|
|
399
|
+
pass
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def parse_args() -> argparse.Namespace:
    """Parse the cluster-controller CLI arguments."""
    parser = argparse.ArgumentParser(description="Cluster Controller")
    parser.add_argument(
        "--project-dir",
        type=str,
        default=str(_WORKSPACE_ROOT),
        help="Base project directory on the cluster (defaults to repo root).",
    )
    parser.add_argument(
        "--project-id",
        type=str,
        help="Project ID (e.g., my-project)",
    )
    parser.add_argument(
        "--cluster-config",
        type=str,
        default=None,
        help="Path to cluster_config.yaml (defaults to project's configs folder)",
    )
    parser.add_argument(
        "--log-file",
        type=str,
        default=None,
        help="Per-run project log file path (overrides auto timestamped path)",
    )
    return parser.parse_args()
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def main() -> None:
    """CLI entry point for the cluster controller.

    Sequence:
      1. Parse args, build the controller, attach per-project logging.
      2. Print log-pointer lines exactly once (guarded by the
         MLOPS_LOG_POINTERS_PRINTED env var, which survives re-exec).
      3. Prepare the per-project interpreter; re-exec this script under it
         when the current interpreter differs (os.execv does not return).
      4. Load cluster_config.yaml and dispatch run_project_with_dask().
    """
    args = parse_args()
    project_dir = Path(args.project_dir).resolve()
    controller = ClusterController(project_dir)
    if args.project_id:
        _setup_logging(controller.logs_dir, args.project_id)

    # Proactively print pointers so users can find logs even before re-exec (print once only)
    try:
        if not os.environ.get("MLOPS_LOG_POINTERS_PRINTED"):
            proj_logs_dir = project_dir / "projects" / (args.project_id or "unknown") / "logs"
            # Always use a fresh per-run log path unless explicitly overridden by --log-file
            if args.log_file:
                proj_log_file = Path(args.log_file)
            else:
                import time as _time
                ts = _time.strftime("%Y%m%d_%H%M%S")
                proj_log_file = proj_logs_dir / f"{args.project_id}_{ts}.log" if args.project_id else None
            if proj_log_file:
                proj_log_file.parent.mkdir(parents=True, exist_ok=True)
                # Child processes read this env var so every process shares one run log.
                os.environ["MLOPS_RUN_LOG_FILE"] = str(proj_log_file)
            ctrl_log_file = controller.logs_dir / f"cluster_controller_{args.project_id}.log" if args.project_id else None
            print(f"Project: {args.project_id}", flush=True)
            if proj_log_file:
                print(f"Project log: {proj_log_file}", flush=True)
            if ctrl_log_file:
                print(f"Controller log: {ctrl_log_file}", flush=True)
            os.environ["MLOPS_LOG_POINTERS_PRINTED"] = "1"
    except Exception as _e:
        logging.getLogger("CLUSTER_CONTROLLER").warning(f"Failed to print log pointers: {_e}")

    if not args.project_id:
        logging.getLogger("CLUSTER_CONTROLLER").error("Requires --project-id")
        return
    cache_base = Path.home() / ".cache" / "mlops-platform" / args.project_id
    env_file = cache_base / "python_interpreter.txt"
    env_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        from mlops.core.pipeline_utils import setup_environment_and_write_interpreter
        project_python = setup_environment_and_write_interpreter(project_dir, args.project_id, env_file)
    except Exception as e:
        logging.getLogger("CLUSTER_CONTROLLER").error(f"Failed to set up environment: {e}")
        raise
    if Path(sys.executable).resolve() != Path(project_python).resolve():
        # Re-exec this script under the project interpreter; execv replaces the
        # current process image, so nothing below runs in that case.
        cmd = [project_python, str(Path(__file__).resolve()), "--project-dir", str(project_dir), "--project-id", args.project_id]
        if getattr(args, "cluster_config", None):
            cmd.extend(["--cluster-config", str(args.cluster_config)])
        os.execv(cmd[0], cmd)
    default_cluster_cfg_path = project_dir / "projects" / args.project_id / "configs" / "cluster_config.yaml"
    cluster_cfg = {}
    cfg_path = Path(args.cluster_config).resolve() if getattr(args, "cluster_config", None) else default_cluster_cfg_path
    if cfg_path.exists():
        try:
            with open(cfg_path, 'r') as f:
                cluster_cfg = yaml.safe_load(f) or {}
            logging.getLogger("CLUSTER_CONTROLLER").info(f"Loaded cluster config from: {cfg_path}")
        except Exception as e:
            logging.getLogger("CLUSTER_CONTROLLER").warning(f"Failed to read cluster config at {cfg_path}: {e}")
            cluster_cfg = {}

    # Extract values strictly from cluster_config.yaml
    cluster_provider = cluster_cfg.get('provider')
    cfg_num_workers = cluster_cfg.get('num_workers')
    try:
        num_workers = int(cfg_num_workers) if cfg_num_workers is not None else 2
    except Exception:
        num_workers = 2
    provider_options: Optional[Dict[str, Any]] = None
    cfg_options = cluster_cfg.get('options') if isinstance(cluster_cfg, dict) else None
    if isinstance(cfg_options, dict):
        provider_options = dict(cfg_options)
    controller.run_project_with_dask(
        project_id=args.project_id,
        cluster_provider=cluster_provider,
        num_workers=num_workers,
        provider_options=provider_options,
    )
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
# Script entry point: run the cluster-controller CLI.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
def _workspace_root() -> Path:
|
|
9
|
+
raw = os.environ.get("MLOPS_WORKSPACE_DIR")
|
|
10
|
+
if raw:
|
|
11
|
+
try:
|
|
12
|
+
return Path(raw).expanduser().resolve()
|
|
13
|
+
except Exception:
|
|
14
|
+
return Path(raw)
|
|
15
|
+
return Path.cwd()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Source-checkout support: when running on a shared filesystem, ensure <workspace>/src is importable.
WORKSPACE_ROOT = _workspace_root()  # resolved once at import time
SRC_DIR = WORKSPACE_ROOT / "src"
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
    # Prepend so a source checkout takes precedence over an installed package.
    sys.path.insert(0, str(SRC_DIR))
|
|
23
|
+
|
|
24
|
+
from mlops.adapters.custom.custom_adapter import CustomModelAdapter
|
|
25
|
+
from mlops.adapters.config_schema import AdapterConfig
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def parse_args() -> argparse.Namespace:
    """Parse the single-process runner CLI arguments."""
    cli = argparse.ArgumentParser(description="Run a single process of a project pipeline")
    cli.add_argument("--project-id", required=True,
                     help="Project ID, e.g., my-project")
    cli.add_argument("--process", required=True,
                     help="Process name to execute (e.g., data_preparation)")
    cli.add_argument("--run-id", required=True,
                     help="Shared run_id across processes")
    cli.add_argument("--config",
                     help="Path to project_config.yaml (optional)")
    return cli.parse_args()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_project_config(project_id: str, config_path_arg: str | None) -> tuple[dict, Path]:
    """Load the project's YAML configuration.

    Args:
        project_id: Project folder name under <workspace>/projects.
        config_path_arg: Optional explicit path to project_config.yaml; when
            None, the project's default configs/project_config.yaml is used.

    Returns:
        Tuple of (config dict, project path). An empty or blank YAML document
        yields {} instead of None so callers can chain ``.get()`` safely —
        consistent with every other config-load site in this package.

    Raises:
        OSError: if the config file cannot be opened.
    """
    project_path = WORKSPACE_ROOT / "projects" / project_id
    if config_path_arg:
        config_path = Path(config_path_arg)
    else:
        config_path = project_path / "configs" / "project_config.yaml"
    with open(config_path, "r") as f:
        # yaml.safe_load returns None for an empty document; normalize to {}.
        config = yaml.safe_load(f) or {}
    return config, project_path
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main() -> None:
    """Execute a single pipeline process inside the current interpreter.

    Loads the project config, optionally derives an executor worker count from
    SLURM CPU env vars when the config does not pin one, then builds a
    CustomModelAdapter and runs exactly one process under the shared run_id.
    """
    args = parse_args()
    platform_config, project_path = load_project_config(args.project_id, args.config)

    # Derive a default executor worker count from the SLURM allocation when the
    # project config does not specify n_workers.
    executor_cfg = platform_config.get("model", {}).get("parameters", {}).get("executor", {}) or {}
    try:
        slurm_cpus = int(os.environ.get("SLURM_CPUS_PER_TASK") or os.environ.get("SLURM_CPUS_ON_NODE") or 0)
    except Exception:
        slurm_cpus = 0
    if slurm_cpus and (not isinstance(executor_cfg, dict) or not executor_cfg.get("n_workers")):
        if not isinstance(executor_cfg, dict):
            executor_cfg = {}
        # Leave one CPU for the parent process when more than two are allocated.
        suggested = max(1, slurm_cpus - 1) if slurm_cpus > 2 else slurm_cpus
        platform_config.setdefault("model", {}).setdefault("parameters", {}).setdefault("executor", {})["n_workers"] = suggested

    adapter_config = AdapterConfig(**platform_config["model"])

    adapter = CustomModelAdapter(
        config=adapter_config,
        python_interpreter=sys.executable,
        project_path=project_path,
    )
    adapter.initialize()

    # Resolve the training data path: absolute wins; otherwise try the project
    # folder first and fall back to the workspace root.
    training_data = platform_config.get("data", {}).get("sources", {}).get("training", {}).get("path")
    training_data_path = None
    if training_data:
        p = Path(training_data)
        if p.is_absolute():
            training_data_path = p
        else:
            cand = (project_path / p)
            training_data_path = cand if cand.exists() else (WORKSPACE_ROOT / p)

    adapter.run(
        data_paths={"training": training_data_path} if training_data_path else {},
        run_id=args.run_id,
        resume_from_process=args.process,
        single_process=True,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Script entry point: run a single pipeline process.
if __name__ == "__main__":
    main()
|