expops 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
@@ -0,0 +1,496 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ import time
7
+ import yaml
8
+ import logging
9
+ from pathlib import Path
10
+ from typing import Optional, Dict, Any
11
+
12
+ def _default_workspace_dir() -> Path:
13
+ raw = os.environ.get("MLOPS_WORKSPACE_DIR")
14
+ if raw:
15
+ try:
16
+ return Path(raw).expanduser().resolve()
17
+ except Exception:
18
+ return Path(raw)
19
+ return Path.cwd()
20
+
21
+
22
+ # Source-checkout support: make <workspace>/src importable when present; otherwise assume installed package.
23
+ _WORKSPACE_ROOT = _default_workspace_dir()
24
+ _SRC_DIR = _WORKSPACE_ROOT / "src"
25
+ if _SRC_DIR.exists() and str(_SRC_DIR) not in sys.path:
26
+ sys.path.insert(0, str(_SRC_DIR))
27
+
28
class ClusterController:
    """Drives a full project run: prepares a per-project Python environment,
    optionally provisions a Dask cluster via a provider (slurm/ansible), then
    launches the pipeline in a subprocess and tears the cluster down again.

    All subprocess output is appended to a per-run project log file whose path
    is shared with child processes via the MLOPS_RUN_LOG_FILE env variable.
    """

    def __init__(self, project_dir: Path):
        """Bind the controller to a workspace directory and create log/artifact dirs."""
        self.project_dir = project_dir.resolve()
        self.logs_dir = self.project_dir / "logs"
        self.artifacts_dir = self.project_dir / "artifacts"
        self.logger = logging.getLogger("CLUSTER_CONTROLLER")
        self._ensure_dirs()

    def _ensure_dirs(self) -> None:
        """Create the logs/ and artifacts/ directories if they do not exist."""
        self.logs_dir.mkdir(parents=True, exist_ok=True)
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

    def _build_kv_env(self, project_id: str) -> Dict[str, str]:
        """Build environment variables for KV backend from project or cluster config.

        Preference order: the project's model.parameters.cache.backend block,
        then the cluster config's kv_store block (redis or gcp). Returns an
        empty dict on any failure — KV export is strictly best-effort here.
        """
        env: Dict[str, str] = {}
        try:
            import yaml as _yaml
            from mlops.runtime.env_export import export_kv_env
            proj_cfg_path = self.project_dir / "projects" / project_id / "configs" / "project_config.yaml"
            proj_cfg = {}
            if proj_cfg_path.exists():
                with open(proj_cfg_path, 'r') as _f:
                    proj_cfg = _yaml.safe_load(_f) or {}
            # Navigate to model.parameters.cache.backend
            try:
                kv_cfg = ((proj_cfg.get('model') or {}).get('parameters') or {}).get('cache', {}) or {}
                backend_cfg = kv_cfg.get('backend') if isinstance(kv_cfg, dict) else {}
                if isinstance(backend_cfg, dict) and backend_cfg:
                    env.update(
                        export_kv_env(
                            backend_cfg,
                            workspace_root=self.project_dir,
                            project_root=(self.project_dir / "projects" / project_id),
                        )
                    )
            except Exception:
                pass
            # Fallback to cluster kv_store block if present
            if not env:
                cfg_path = self.project_dir / "projects" / project_id / "configs" / "cluster_config.yaml"
                cluster_cfg_local: Dict[str, Any] = {}
                if cfg_path.exists():
                    with open(cfg_path, 'r') as _f:
                        cluster_cfg_local = _yaml.safe_load(_f) or {}
                cluster_kv = cluster_cfg_local.get('kv_store') if isinstance(cluster_cfg_local, dict) else None
                if isinstance(cluster_kv, dict):
                    backend = cluster_kv.get('backend', 'redis')
                    if backend == 'redis':
                        backend_cfg2 = {
                            "type": "redis",
                            "host": cluster_kv.get("host"),
                            "port": cluster_kv.get("port"),
                            "db": cluster_kv.get("db"),
                            "password": cluster_kv.get("password"),
                        }
                        env.update(
                            export_kv_env(
                                backend_cfg2,
                                workspace_root=self.project_dir,
                                project_root=(self.project_dir / "projects" / project_id),
                            )
                        )
                    elif backend == 'gcp':
                        backend_cfg2 = {
                            "type": "gcp",
                            "gcp_project": cluster_kv.get("gcp_project"),
                            "emulator_host": cluster_kv.get("emulator_host"),
                            "credentials_json": cluster_kv.get("credentials_json"),
                        }
                        env.update(
                            export_kv_env(
                                backend_cfg2,
                                workspace_root=self.project_dir,
                                project_root=(self.project_dir / "projects" / project_id),
                            )
                        )
        except Exception:
            # Any failure (missing yaml, unreadable config, export error) yields
            # no KV env rather than aborting the run.
            env = {}
        return env

    def _load_executor_config(self, project_id: str) -> Dict[str, Any]:
        """Load executor config block from the project's project_config.yaml."""
        try:
            proj_cfg_path = self.project_dir / "projects" / project_id / "configs" / "project_config.yaml"
            if not proj_cfg_path.exists():
                return {}
            with open(proj_cfg_path, 'r') as f:
                proj_cfg = yaml.safe_load(f) or {}
            executor_cfg = ((proj_cfg.get('model') or {}).get('parameters') or {}).get('executor', {}) or {}
            return executor_cfg if isinstance(executor_cfg, dict) else {}
        except Exception:
            # Best-effort: an unreadable config is treated as "no executor config".
            return {}

    @staticmethod
    def _extract_comm_compression(executor_cfg: Dict[str, Any]) -> Optional[str]:
        """Extract the Dask comm compression codec from an executor config block.

        Accepts either `dask`/`dask_config` sub-blocks with `comm.compression`
        (or `comm.codec`), or a flat `compression` key. Returns None when absent.
        """
        if not isinstance(executor_cfg, dict):
            return None
        dask_cfg = executor_cfg.get('dask') or executor_cfg.get('dask_config') or {}
        if not isinstance(dask_cfg, dict):
            return None
        comm_cfg = dask_cfg.get('comm') or {}
        if isinstance(comm_cfg, dict):
            comp = comm_cfg.get('compression') or comm_cfg.get('codec')
            if comp:
                return str(comp)
        comp = dask_cfg.get('compression')
        return str(comp) if comp else None

    def run_project_with_dask(self, project_id: str,
                              cluster_provider: Optional[str] = None,
                              num_workers: int = 2,
                              provider_options: Optional[Dict[str, Any]] = None) -> None:
        """Run the project locally while provisioning a Dask cluster via a provider (slurm/ansible).

        Steps: (1) build KV/executor env from config; (2) set up a per-project
        interpreter via setup_env.py; (3) editable-install this package plus
        dask/distributed (and dask-jobqueue for slurm) into that env; (4) start
        the provider (if any) and export DASK_SCHEDULER_ADDRESS; (5) run
        `mlops.main run <project_id>` under the project interpreter; (6) always
        stop the provider, then propagate a non-zero exit via SystemExit.

        Raises:
            RuntimeError: environment setup or dependency verification failed.
            SystemExit: the pipeline subprocess exited non-zero.
        """
        self.logger.info(f"Project directory: {self.project_dir}")
        self.logger.info(f"Logs directory: {self.logs_dir}")
        self.logger.info(f"Artifacts directory: {self.artifacts_dir}")

        # Prepare KV env from config
        kv_env = self._build_kv_env(project_id)
        executor_cfg = self._load_executor_config(project_id)
        comm_compression = self._extract_comm_compression(executor_cfg)

        # Prepare a per-project interpreter using the environment manager
        cache_base = Path.home() / ".cache" / "mlops-platform" / project_id
        env_file = cache_base / "python_interpreter.txt"
        env_file.parent.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"Ensuring project interpreter at {env_file}")
        setup_cmd = [
            sys.executable,
            str(self.project_dir / "src" / "mlops" / "environment" / "setup_env.py"),
            "--project-id", project_id,
            "--project-dir", str(self.project_dir),
            "--env-file", str(env_file),
        ]
        # Log file for this run (unique per run)
        env_log_hint = os.environ.get("MLOPS_RUN_LOG_FILE")
        if env_log_hint:
            proj_log_file = Path(env_log_hint)
            proj_log_file.parent.mkdir(parents=True, exist_ok=True)
        else:
            proj_logs_dir = self.project_dir / "projects" / project_id / "logs"
            proj_logs_dir.mkdir(parents=True, exist_ok=True)
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            proj_log_file = proj_logs_dir / f"{project_id}_{timestamp}.log"
            os.environ["MLOPS_RUN_LOG_FILE"] = str(proj_log_file)
        # Note: pointers are printed in main() before re-exec; avoid duplicate prints here
        with open(proj_log_file, "a", encoding="utf-8") as lf:
            res = subprocess.run(setup_cmd, stdout=lf, stderr=lf, text=True)
        if res.returncode != 0:
            self.logger.error("Project environment setup failed. See project log for details.")
            raise RuntimeError("Failed to set up project environment")

        # Read the project interpreter path produced by setup
        if not env_file.exists():
            raise RuntimeError(f"Missing environment interpreter file at {env_file}")
        project_python = env_file.read_text().strip()
        if not project_python:
            raise RuntimeError("Empty interpreter path read from env file")
        # Make sure required dependencies are present in the project environment
        try:
            # Upgrade pip and core build tools
            with open(proj_log_file, "a", encoding="utf-8") as lf:
                subprocess.run([project_python, "-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], check=False, stdout=lf, stderr=lf, text=True)
            # Install our package (and extras if needed) into the project env using an absolute path
            repo_path = str(self.project_dir)
            if cluster_provider == 'slurm':
                with open(proj_log_file, "a", encoding="utf-8") as lf:
                    subprocess.run([project_python, "-m", "pip", "install", "-e", f"{repo_path}[slurm]"], check=False, stdout=lf, stderr=lf, text=True)
            else:
                with open(proj_log_file, "a", encoding="utf-8") as lf:
                    subprocess.run([project_python, "-m", "pip", "install", "-e", repo_path], check=False, stdout=lf, stderr=lf, text=True)
            # Ensure pydantic is available for adapters/config schema regardless of editable install success
            try:
                subprocess.run([project_python, "-c", "import pydantic"], check=True, capture_output=True, text=True)
            except Exception:
                with open(proj_log_file, "a", encoding="utf-8") as lf:
                    lf.write("Installing missing dependency: pydantic>=2\n")
                    res = subprocess.run([project_python, "-m", "pip", "install", "pydantic>=2"], check=False, stdout=lf, stderr=lf, text=True)
                    if res.returncode != 0:
                        lf.write("Retrying pydantic install with --user...\n")
                        subprocess.run([project_python, "-m", "pip", "install", "--user", "pydantic>=2"], check=False, stdout=lf, stderr=lf, text=True)
        except Exception:
            # Installs above are best-effort; hard verification happens below.
            pass

        # Verify core distributed dependencies; attempt to install if missing
        def _ensure_importable(py: str, module: str) -> bool:
            # Probe the *project* interpreter, not this process, for importability.
            try:
                subprocess.run([py, "-c", f"import {module}"], check=True, capture_output=True, text=True)
                return True
            except Exception:
                return False

        core_missing = []
        for mod in ("dask", "distributed"):
            if not _ensure_importable(project_python, mod):
                core_missing.append(mod)
        if core_missing:
            self.logger.info(f"Installing missing core deps into project env: {', '.join(core_missing)}")
            with open(proj_log_file, "a", encoding="utf-8") as lf:
                subprocess.run([project_python, "-m", "pip", "install", *core_missing], check=False, stdout=lf, stderr=lf, text=True)
        # For slurm provider ensure dask-jobqueue is present
        if cluster_provider == 'slurm' and not _ensure_importable(project_python, "dask_jobqueue"):
            self.logger.info("Installing missing SLURM extra: dask-jobqueue>=0.8.0")
            with open(proj_log_file, "a", encoding="utf-8") as lf:
                subprocess.run([project_python, "-m", "pip", "install", "dask-jobqueue>=0.8.0"], check=False, stdout=lf, stderr=lf, text=True)
        # Final gate: give a clear error early if deps are still missing
        if not _ensure_importable(project_python, "dask") or not _ensure_importable(project_python, "distributed"):
            raise RuntimeError("Dask is not available in the project environment. Ensure connectivity or pin it in your project's requirements.txt.")
        if cluster_provider == 'slurm' and not _ensure_importable(project_python, "dask_jobqueue"):
            raise RuntimeError("dask-jobqueue is not available in the project environment. Ensure connectivity or pin it in your project's requirements.txt.")

        # In-process import sanity (rare HPC oddities): ensure this running interpreter can import dask/distributed
        try:
            import dask as _d  # type: ignore
            import distributed as _dist  # type: ignore
            try:
                _dv = getattr(_d, "__version__", "unknown")
            except Exception:
                _dv = "unknown"
            try:
                _disv = getattr(_dist, "__version__", "unknown")
            except Exception:
                _disv = "unknown"
            self.logger.info(f"Dask import OK in-process: dask={_dv}, distributed={_disv}")
        except Exception as _imp_e:
            self.logger.warning(f"In-process import of dask failed: {_imp_e}. Patching sys.path using project env site-packages...")
            try:
                # Ask the project interpreter for its site-packages and graft it
                # onto this process's sys.path, then retry the import.
                sp = subprocess.run(
                    [project_python, "-c", "import sysconfig; print(sysconfig.get_paths().get('purelib') or '')"],
                    capture_output=True,
                    text=True,
                    check=False,
                )
                site_pkgs = (sp.stdout or "").strip()
                if site_pkgs:
                    if site_pkgs not in sys.path:
                        sys.path.insert(0, site_pkgs)
                    try:
                        import dask as _d2  # type: ignore
                        import distributed as _dist2  # type: ignore
                        _dv2 = getattr(_d2, "__version__", "unknown")
                        _disv2 = getattr(_dist2, "__version__", "unknown")
                        self.logger.info(f"Dask import OK after patch: dask={_dv2}, distributed={_disv2}")
                    except Exception as _imp_e2:
                        raise RuntimeError(f"Dask remains unimportable after site-packages patch: {_imp_e2}")
                else:
                    raise RuntimeError("Unable to resolve site-packages path for project interpreter")
            except Exception as _patch_e:
                raise RuntimeError(f"Failed to ensure in-process dask importability: {_patch_e}")

        # Prepare environment for running the pipeline
        env = os.environ.copy()
        if comm_compression:
            env['DASK_DISTRIBUTED__COMM__COMPRESSION'] = comm_compression
        else:
            # Default codec when the config does not specify one.
            env.setdefault('DASK_DISTRIBUTED__COMM__COMPRESSION', 'zlib')
        env['PYTHONUNBUFFERED'] = '1'
        # NOTE(review): ':' separator assumes a POSIX host — os.pathsep would be portable.
        env['PYTHONPATH'] = f"{self.project_dir / 'src'}:{env.get('PYTHONPATH', '')}".rstrip(':')
        for k, v in kv_env.items():
            env[k] = v

        provider_options = dict(provider_options or {})
        if comm_compression:
            provider_options.setdefault('comm_compression', comm_compression)

        # If a provider is requested, start it here, run pipeline, then stop it
        provider_obj = None
        if cluster_provider:
            self.logger.info(f"Starting provider: {cluster_provider} with {num_workers} workers")
            try:
                from mlops.cluster.providers import SlurmClusterProvider, AnsibleClusterProvider
                if cluster_provider == 'slurm':
                    provider_obj = SlurmClusterProvider()
                elif cluster_provider == 'ansible':
                    provider_obj = AnsibleClusterProvider()
                else:
                    raise ValueError(f"Unknown provider: {cluster_provider}")
                # Ensure workers use the same Python interpreter as the prepared project environment
                provider_options = provider_options or {}
                cluster_kwargs = dict((provider_options.get('cluster_kwargs') or {}))
                cluster_kwargs.setdefault('python', project_python)
                provider_options['cluster_kwargs'] = cluster_kwargs
                _, addr = provider_obj.start(num_workers=num_workers, options=provider_options or {})
                if not addr:
                    raise RuntimeError("Failed to obtain scheduler address from provider")
                env['DASK_SCHEDULER_ADDRESS'] = str(addr)
                self.logger.info(f"Using Dask scheduler at {addr}")
                try:
                    print(f"Dask scheduler: {addr}")
                except Exception:
                    pass
            except Exception as e:
                # Stop a half-started provider before propagating the failure.
                if provider_obj:
                    try:
                        provider_obj.stop()
                    except Exception:
                        pass
                raise

        # Run the pipeline using the prepared interpreter
        self.logger.info(f"Running pipeline for project '{project_id}'")
        run_cmd = [project_python, "-m", "mlops.main", "run", project_id]
        # Ensure inner processes honor this unique run log
        env['MLOPS_RUN_LOG_FILE'] = str(proj_log_file)
        # Prevent recursion back into the controller when the CLI decides how to run
        env['MLOPS_FORCE_LOCAL'] = '1'
        with open(proj_log_file, "a", encoding="utf-8") as lf:
            result = subprocess.run(run_cmd, env=env, stdout=lf, stderr=lf, text=True)
        rc = result.returncode
        # Try to extract and surface the run ID from the project log
        try:
            run_id_val = None
            with open(proj_log_file, "r", encoding="utf-8") as rf:
                for line in rf:
                    if "with run_id:" in line:
                        # e.g., Executing project 'my-project' with run_id: project-my-project-XXXX
                        idx = line.find("with run_id:")
                        if idx >= 0:
                            run_id_val = line[idx + len("with run_id:"):].strip().strip("'\"")
                    elif "ID: 'project-" in line:
                        # e.g., [NoOpTracker] Started run ... ID: 'project-...'
                        try:
                            start = line.index("ID: '") + 5
                            end = line.index("'", start)
                            run_id_val = line[start:end]
                        except Exception:
                            pass
            if run_id_val:
                print(f"Run ID: {run_id_val}")
        except Exception:
            pass
        # Always attempt to stop provider
        if provider_obj:
            try:
                provider_obj.stop()
            except Exception as e:
                self.logger.warning(f"Provider stop returned error: {e}")
        if rc != 0:
            raise SystemExit(rc)
        self.logger.info("Project completed successfully via Dask scheduler.")
368
+
369
+
370
+ def _setup_logging(logs_dir: Path, project_id: str) -> None:
371
+ logs_dir.mkdir(parents=True, exist_ok=True)
372
+ log_file = logs_dir / f"cluster_controller_{project_id}.log"
373
+ root = logging.getLogger()
374
+ # Avoid duplicate handlers for repeated invocations
375
+ exists = False
376
+ for h in root.handlers:
377
+ try:
378
+ if isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', '') == str(log_file):
379
+ exists = True
380
+ break
381
+ except Exception:
382
+ continue
383
+ if not exists:
384
+ root.setLevel(logging.INFO)
385
+ fmt = logging.Formatter("%(asctime)s %(levelname)s [%(name)s] %(message)s")
386
+ fh = logging.FileHandler(str(log_file), encoding="utf-8")
387
+ fh.setLevel(logging.INFO)
388
+ fh.setFormatter(fmt)
389
+ root.addHandler(fh)
390
+ # Also echo warnings/errors to the console so critical info is visible in terminal
391
+ try:
392
+ has_stream = any(isinstance(h, logging.StreamHandler) for h in root.handlers)
393
+ if not has_stream:
394
+ ch = logging.StreamHandler(sys.stdout)
395
+ ch.setLevel(logging.WARNING)
396
+ ch.setFormatter(fmt)
397
+ root.addHandler(ch)
398
+ except Exception:
399
+ pass
400
+
401
+
402
def parse_args() -> argparse.Namespace:
    """Parse the cluster-controller command line and return the namespace."""
    cli = argparse.ArgumentParser(description="Cluster Controller")
    cli.add_argument(
        "--project-dir",
        type=str,
        default=str(_WORKSPACE_ROOT),
        help="Base project directory on the cluster (defaults to repo root).",
    )
    cli.add_argument(
        "--project-id",
        type=str,
        help="Project ID (e.g., my-project)",
    )
    cli.add_argument(
        "--cluster-config",
        type=str,
        default=None,
        help="Path to cluster_config.yaml (defaults to project's configs folder)",
    )
    cli.add_argument(
        "--log-file",
        type=str,
        default=None,
        help="Per-run project log file path (overrides auto timestamped path)",
    )
    return cli.parse_args()
414
+
415
+
416
def main() -> None:
    """CLI entry point: ensure the project environment, then run the project.

    Flow: parse args → print log-file pointers once → set up the per-project
    interpreter → re-exec this script under that interpreter if it differs
    from the current one (os.execv replaces this process) → load
    cluster_config.yaml → delegate to ClusterController.run_project_with_dask.
    """
    args = parse_args()
    project_dir = Path(args.project_dir).resolve()
    controller = ClusterController(project_dir)
    if args.project_id:
        _setup_logging(controller.logs_dir, args.project_id)

    # Proactively print pointers so users can find logs even before re-exec (print once only)
    try:
        if not os.environ.get("MLOPS_LOG_POINTERS_PRINTED"):
            proj_logs_dir = project_dir / "projects" / (args.project_id or "unknown") / "logs"
            # Always use a fresh per-run log path unless explicitly overridden by --log-file
            if args.log_file:
                proj_log_file = Path(args.log_file)
            else:
                import time as _time
                ts = _time.strftime("%Y%m%d_%H%M%S")
                proj_log_file = proj_logs_dir / f"{args.project_id}_{ts}.log" if args.project_id else None
            if proj_log_file:
                proj_log_file.parent.mkdir(parents=True, exist_ok=True)
                # Exported so the re-exec'd process and children reuse this exact log file.
                os.environ["MLOPS_RUN_LOG_FILE"] = str(proj_log_file)
            ctrl_log_file = controller.logs_dir / f"cluster_controller_{args.project_id}.log" if args.project_id else None
            print(f"Project: {args.project_id}", flush=True)
            if proj_log_file:
                print(f"Project log: {proj_log_file}", flush=True)
            if ctrl_log_file:
                print(f"Controller log: {ctrl_log_file}", flush=True)
            # Guard flag survives the os.execv below, preventing duplicate prints.
            os.environ["MLOPS_LOG_POINTERS_PRINTED"] = "1"
    except Exception as _e:
        logging.getLogger("CLUSTER_CONTROLLER").warning(f"Failed to print log pointers: {_e}")

    if not args.project_id:
        logging.getLogger("CLUSTER_CONTROLLER").error("Requires --project-id")
        return
    cache_base = Path.home() / ".cache" / "mlops-platform" / args.project_id
    env_file = cache_base / "python_interpreter.txt"
    env_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        from mlops.core.pipeline_utils import setup_environment_and_write_interpreter
        project_python = setup_environment_and_write_interpreter(project_dir, args.project_id, env_file)
    except Exception as e:
        logging.getLogger("CLUSTER_CONTROLLER").error(f"Failed to set up environment: {e}")
        raise
    if Path(sys.executable).resolve() != Path(project_python).resolve():
        # Re-exec under the project interpreter; os.execv never returns.
        cmd = [project_python, str(Path(__file__).resolve()), "--project-dir", str(project_dir), "--project-id", args.project_id]
        if getattr(args, "cluster_config", None):
            cmd.extend(["--cluster-config", str(args.cluster_config)])
        os.execv(cmd[0], cmd)
    default_cluster_cfg_path = project_dir / "projects" / args.project_id / "configs" / "cluster_config.yaml"
    cluster_cfg = {}
    cfg_path = Path(args.cluster_config).resolve() if getattr(args, "cluster_config", None) else default_cluster_cfg_path
    if cfg_path.exists():
        try:
            with open(cfg_path, 'r') as f:
                cluster_cfg = yaml.safe_load(f) or {}
            logging.getLogger("CLUSTER_CONTROLLER").info(f"Loaded cluster config from: {cfg_path}")
        except Exception as e:
            logging.getLogger("CLUSTER_CONTROLLER").warning(f"Failed to read cluster config at {cfg_path}: {e}")
            cluster_cfg = {}

    # Extract values strictly from cluster_config.yaml
    cluster_provider = cluster_cfg.get('provider')
    cfg_num_workers = cluster_cfg.get('num_workers')
    try:
        num_workers = int(cfg_num_workers) if cfg_num_workers is not None else 2
    except Exception:
        # Non-numeric num_workers falls back to the default of 2.
        num_workers = 2
    provider_options: Optional[Dict[str, Any]] = None
    cfg_options = cluster_cfg.get('options') if isinstance(cluster_cfg, dict) else None
    if isinstance(cfg_options, dict):
        provider_options = dict(cfg_options)
    controller.run_project_with_dask(
        project_id=args.project_id,
        cluster_provider=cluster_provider,
        num_workers=num_workers,
        provider_options=provider_options,
    )
493
+
494
+
495
# Script entry point: run the controller CLI when executed directly.
if __name__ == "__main__":
    main()
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ import yaml
7
+
8
+ def _workspace_root() -> Path:
9
+ raw = os.environ.get("MLOPS_WORKSPACE_DIR")
10
+ if raw:
11
+ try:
12
+ return Path(raw).expanduser().resolve()
13
+ except Exception:
14
+ return Path(raw)
15
+ return Path.cwd()
16
+
17
+
18
# Source-checkout support: when running on a shared filesystem, ensure <workspace>/src is importable.
WORKSPACE_ROOT = _workspace_root()
SRC_DIR = WORKSPACE_ROOT / "src"
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
    # Prepend so a source checkout shadows any installed copy of the package.
    sys.path.insert(0, str(SRC_DIR))

# These imports must follow the sys.path patch above so a source checkout wins.
from mlops.adapters.custom.custom_adapter import CustomModelAdapter
from mlops.adapters.config_schema import AdapterConfig
+
27
+
28
def parse_args() -> argparse.Namespace:
    """Read the single-process runner's command-line options."""
    ap = argparse.ArgumentParser(description="Run a single process of a project pipeline")
    ap.add_argument("--project-id", required=True, help="Project ID, e.g., my-project")
    ap.add_argument("--process", required=True, help="Process name to execute (e.g., data_preparation)")
    ap.add_argument("--run-id", required=True, help="Shared run_id across processes")
    ap.add_argument("--config", help="Path to project_config.yaml (optional)")
    return ap.parse_args()
35
+
36
+
37
def load_project_config(project_id: str, config_path_arg: str | None) -> tuple[dict, Path]:
    """Load a project's YAML configuration.

    Args:
        project_id: Project folder name under <workspace>/projects.
        config_path_arg: Explicit config path; when None, defaults to
            <project>/configs/project_config.yaml.

    Returns:
        (config, project_path) — config is always a dict: yaml.safe_load
        returns None for an empty file, which is normalized to {} so callers
        can chain .get(...) safely.

    Raises:
        FileNotFoundError: the config file does not exist.
    """
    project_path = WORKSPACE_ROOT / "projects" / project_id
    if config_path_arg:
        config_path = Path(config_path_arg)
    else:
        config_path = project_path / "configs" / "project_config.yaml"
    with open(config_path, "r") as f:
        config = yaml.safe_load(f) or {}
    return config, project_path
46
+
47
+
48
def main() -> None:
    """Run exactly one pipeline process inside an existing run.

    Loads the project config, derives a worker count from SLURM CPU env vars
    when the config does not pin one, builds a CustomModelAdapter, resolves
    the training data path, and invokes the adapter in single-process mode.
    """
    args = parse_args()
    platform_config, project_path = load_project_config(args.project_id, args.config)

    executor_cfg = platform_config.get("model", {}).get("parameters", {}).get("executor", {}) or {}
    try:
        # Prefer the per-task CPU allocation; fall back to per-node, then 0 (unknown).
        slurm_cpus = int(os.environ.get("SLURM_CPUS_PER_TASK") or os.environ.get("SLURM_CPUS_ON_NODE") or 0)
    except Exception:
        slurm_cpus = 0
    if slurm_cpus and (not isinstance(executor_cfg, dict) or not executor_cfg.get("n_workers")):
        if not isinstance(executor_cfg, dict):
            executor_cfg = {}
        # Leave one CPU for the parent process when more than 2 are allocated.
        suggested = max(1, slurm_cpus - 1) if slurm_cpus > 2 else slurm_cpus
        platform_config.setdefault("model", {}).setdefault("parameters", {}).setdefault("executor", {})["n_workers"] = suggested

    adapter_config = AdapterConfig(**platform_config["model"])

    adapter = CustomModelAdapter(
        config=adapter_config,
        python_interpreter=sys.executable,
        project_path=project_path,
    )
    adapter.initialize()

    # Resolve the training data path: absolute paths pass through; relative
    # paths are tried against the project dir first, then the workspace root.
    training_data = platform_config.get("data", {}).get("sources", {}).get("training", {}).get("path")
    training_data_path = None
    if training_data:
        p = Path(training_data)
        if p.is_absolute():
            training_data_path = p
        else:
            cand = (project_path / p)
            training_data_path = cand if cand.exists() else (WORKSPACE_ROOT / p)

    adapter.run(
        data_paths={"training": training_data_path} if training_data_path else {},
        run_id=args.run_id,
        resume_from_process=args.process,
        single_process=True,
    )
88
+
89
+
90
# Script entry point: run one pipeline process when executed directly.
if __name__ == "__main__":
    main()