expops-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- expops-0.1.3.dist-info/METADATA +826 -0
- expops-0.1.3.dist-info/RECORD +86 -0
- expops-0.1.3.dist-info/WHEEL +5 -0
- expops-0.1.3.dist-info/entry_points.txt +3 -0
- expops-0.1.3.dist-info/licenses/LICENSE +674 -0
- expops-0.1.3.dist-info/top_level.txt +1 -0
- mlops/__init__.py +0 -0
- mlops/__main__.py +11 -0
- mlops/_version.py +34 -0
- mlops/adapters/__init__.py +12 -0
- mlops/adapters/base.py +86 -0
- mlops/adapters/config_schema.py +89 -0
- mlops/adapters/custom/__init__.py +3 -0
- mlops/adapters/custom/custom_adapter.py +447 -0
- mlops/adapters/plugin_manager.py +113 -0
- mlops/adapters/sklearn/__init__.py +3 -0
- mlops/adapters/sklearn/adapter.py +94 -0
- mlops/cluster/__init__.py +3 -0
- mlops/cluster/controller.py +496 -0
- mlops/cluster/process_runner.py +91 -0
- mlops/cluster/providers.py +258 -0
- mlops/core/__init__.py +95 -0
- mlops/core/custom_model_base.py +38 -0
- mlops/core/dask_networkx_executor.py +1265 -0
- mlops/core/executor_worker.py +1239 -0
- mlops/core/experiment_tracker.py +81 -0
- mlops/core/graph_types.py +64 -0
- mlops/core/networkx_parser.py +135 -0
- mlops/core/payload_spill.py +278 -0
- mlops/core/pipeline_utils.py +162 -0
- mlops/core/process_hashing.py +216 -0
- mlops/core/step_state_manager.py +1298 -0
- mlops/core/step_system.py +956 -0
- mlops/core/workspace.py +99 -0
- mlops/environment/__init__.py +10 -0
- mlops/environment/base.py +43 -0
- mlops/environment/conda_manager.py +307 -0
- mlops/environment/factory.py +70 -0
- mlops/environment/pyenv_manager.py +146 -0
- mlops/environment/setup_env.py +31 -0
- mlops/environment/system_manager.py +66 -0
- mlops/environment/utils.py +105 -0
- mlops/environment/venv_manager.py +134 -0
- mlops/main.py +527 -0
- mlops/managers/project_manager.py +400 -0
- mlops/managers/reproducibility_manager.py +575 -0
- mlops/platform.py +996 -0
- mlops/reporting/__init__.py +16 -0
- mlops/reporting/context.py +187 -0
- mlops/reporting/entrypoint.py +292 -0
- mlops/reporting/kv_utils.py +77 -0
- mlops/reporting/registry.py +50 -0
- mlops/runtime/__init__.py +9 -0
- mlops/runtime/context.py +34 -0
- mlops/runtime/env_export.py +113 -0
- mlops/storage/__init__.py +12 -0
- mlops/storage/adapters/__init__.py +9 -0
- mlops/storage/adapters/gcp_kv_store.py +778 -0
- mlops/storage/adapters/gcs_object_store.py +96 -0
- mlops/storage/adapters/memory_store.py +240 -0
- mlops/storage/adapters/redis_store.py +438 -0
- mlops/storage/factory.py +199 -0
- mlops/storage/interfaces/__init__.py +6 -0
- mlops/storage/interfaces/kv_store.py +118 -0
- mlops/storage/path_utils.py +38 -0
- mlops/templates/premier-league/charts/plot_metrics.js +70 -0
- mlops/templates/premier-league/charts/plot_metrics.py +145 -0
- mlops/templates/premier-league/charts/requirements.txt +6 -0
- mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
- mlops/templates/premier-league/configs/project_config.yaml +207 -0
- mlops/templates/premier-league/data/England CSV.csv +12154 -0
- mlops/templates/premier-league/models/premier_league_model.py +638 -0
- mlops/templates/premier-league/requirements.txt +8 -0
- mlops/templates/sklearn-basic/README.md +22 -0
- mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
- mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
- mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
- mlops/templates/sklearn-basic/data/train.csv +14 -0
- mlops/templates/sklearn-basic/models/model.py +62 -0
- mlops/templates/sklearn-basic/requirements.txt +10 -0
- mlops/web/__init__.py +3 -0
- mlops/web/server.py +585 -0
- mlops/web/ui/index.html +52 -0
- mlops/web/ui/mlops-charts.js +357 -0
- mlops/web/ui/script.js +1244 -0
- mlops/web/ui/styles.css +248 -0
mlops/platform.py
ADDED
@@ -0,0 +1,996 @@
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict

from datetime import datetime
import hashlib
import json
import logging
import os
import re
import subprocess
import sys
import time
import uuid

import yaml

from .adapters.plugin_manager import AdapterPluginManager
from .adapters.config_schema import AdapterConfig
from .managers.reproducibility_manager import ReproducibilityManager
from .managers.project_manager import ProjectManager
from .core.experiment_tracker import ExperimentTracker, NoOpExperimentTracker
from .core.workspace import get_projects_root, get_workspace_root, infer_source_root


class MLPlatform:
    """Main platform class that orchestrates the ML pipeline."""

    def __init__(self) -> None:
        self.adapter_manager = AdapterPluginManager()
        # Adapter discovery already tries both `mlops.*` and `src.mlops.*` layouts.
        self.adapter_manager.discover_adapters("mlops.adapters")

        # Track output directories of dynamic charts so we can upload artifacts later.
        self._dynamic_chart_outputs: list[tuple[str, Path]] = []

    def _initialize_tracker(self, platform_config: Dict[str, Any]) -> ExperimentTracker:
        """Initialize experiment tracker from configuration."""
        logger = logging.getLogger(__name__)
        tracking_config = platform_config.get("reproducibility", {}).get("experiment_tracking", {})
        tracker_name = str(tracking_config.get("backend", "noop") or "noop").strip().lower()
        tracker_params = tracking_config.get("parameters", {})
        if not isinstance(tracker_params, dict):
            tracker_params = {}

        # NOTE: The platform's primary metrics path is `mlops.core.step_system.log_metric`
        # (KV-store based). External experiment tracking backends are optional.
        if tracker_name not in {"noop"}:
            logger.warning(
                f"Experiment tracker backend '{tracker_name}' is not available in this build. "
                f"Falling back to NoOpExperimentTracker."
            )

        return NoOpExperimentTracker(config=tracker_params)

    def _repo_root(self) -> Path:
        # Legacy name; this is now the workspace root (where projects/ lives).
        try:
            return get_workspace_root()
        except Exception:
            return Path.cwd()

    def _set_env_var(self, key: str, value: str) -> None:
        """Best-effort environment variable setter (never raises)."""
        try:
            os.environ[key] = value
        except Exception:
            pass

    def _get_reporting_python_exec(self) -> str:
        try:
            return os.environ.get("MLOPS_REPORTING_PYTHON") or os.environ.get("MLOPS_RUNTIME_PYTHON") or sys.executable
        except Exception:
            return sys.executable

    def _in_distributed_mode(self) -> bool:
        return bool(os.environ.get("DASK_SCHEDULER_ADDRESS") or os.environ.get("MLOPS_CLUSTER_MODE"))

    def _get_project_id_from_adapter(self, adapter: Any) -> str:
        try:
            ssm = getattr(adapter, 'step_state_manager', None)
            kv = getattr(ssm, 'kv_store', None) if ssm else None
            return (getattr(kv, 'project_id', None) if kv else None) or os.environ.get("MLOPS_PROJECT_ID") or ""
        except Exception:
            return os.environ.get("MLOPS_PROJECT_ID") or ""

    def _get_project_dir_hint(self, project_id: str):
        try:
            pm = ProjectManager()
            return pm.get_project_path(project_id)
        except Exception:
            return None

    def _resolve_entrypoint_path(self, entrypoint: str, project_dir_hint: Path | None = None) -> Path | None:
        try:
            ep = Path(entrypoint)
            if ep.is_absolute() and ep.exists():
                return ep
            if ep.exists():
                return ep
            if project_dir_hint:
                cand = (Path(project_dir_hint) / entrypoint)
                if cand.exists():
                    return cand
            ws = self._repo_root()
            cand = (ws / entrypoint)
            return cand if cand.exists() else None
        except Exception:
            return None

    def _default_reporting_entrypoint_path(self) -> Path | None:
        """Return the built-in reporting entrypoint file path (inside the installed package)."""
        try:
            import mlops.reporting.entrypoint as _entry
            p = Path(getattr(_entry, "__file__", "") or "")
            return p if p.exists() else None
        except Exception:
            return None

    def _maybe_apply_cache_env(self, env: dict, platform_config: Dict[str, Any], project_dir_hint) -> None:
        cache_cfg = ((platform_config.get("model") or {}).get("parameters") or {}).get("cache") or {}
        if not isinstance(cache_cfg, dict):
            return
        backend_cfg = cache_cfg.get("backend") or {}
        if not isinstance(backend_cfg, dict):
            return

        gcp_project = backend_cfg.get("gcp_project")
        if gcp_project:
            env["GOOGLE_CLOUD_PROJECT"] = str(gcp_project)
        emulator_host = backend_cfg.get("emulator_host")
        if emulator_host:
            env["FIRESTORE_EMULATOR_HOST"] = str(emulator_host)
        creds_path = backend_cfg.get("credentials_json")
        if creds_path:
            try:
                creds_path_val = str(creds_path)
                if not Path(creds_path_val).is_absolute() and project_dir_hint:
                    creds_path_val = str(Path(project_dir_hint) / creds_path_val)
                env["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path_val
            except Exception:
                pass

    def _ensure_repo_src_on_pythonpath(self, env: dict) -> None:
        src_root = infer_source_root()
        if not src_root:
            return
        repo_src = str(src_root / "src")
        prev_pp = str(env.get("PYTHONPATH", "") or "")
        if repo_src in prev_pp.split(":"):
            return
        env["PYTHONPATH"] = f"{repo_src}:{prev_pp}".rstrip(":")

    def _compute_config_hash(self, config_content: Dict[str, Any]) -> str:
        """Compute a stable hash of the configuration content (excluding run_id)."""
        logger = logging.getLogger(__name__)
        try:
            config_copy = dict(config_content)
            config_copy.pop("run_id", None)

            config_str = json.dumps(config_copy, sort_keys=True, default=str, separators=(",", ":"))

            return hashlib.sha256(config_str.encode()).hexdigest()
        except Exception as e:
            logger.warning(f"Failed to compute config hash: {e}. Falling back to random UUID-based hash.")
            return str(uuid.uuid4()).replace("-", "")[:16]

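    # Illustrative run_id formats produced by _generate_run_id below (timestamp and
    # hex suffix are assumed example values):
    #   with a project id: "project-<project_id>-20250101120000-1a2b3c4d"
    #   without one:       "config-20250101120000-1a2b3c4d"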
    def _generate_run_id(self, platform_config: Dict[str, Any], project_id: str | None = None) -> str:
        """Generate or extract run ID from configuration."""
        run_id_from_config = platform_config.get("run_id")
        if run_id_from_config and run_id_from_config not in ["${RUN_ID:-auto-generated}", "auto-generated"]:
            return str(run_id_from_config)

        # Always add a unique suffix so every execution has a distinct run_id
        unique_suffix = datetime.utcnow().strftime("%Y%m%d%H%M%S") + "-" + uuid.uuid4().hex[:8]
        if project_id:
            return f"project-{project_id}-{unique_suffix}"
        return f"config-{unique_suffix}"

    def _prepare_run_metadata(self, platform_config: Dict[str, Any], run_id: str) -> Dict[str, Any]:
        """Prepare run metadata for tracking."""
        run_name = platform_config.get("metadata", {}).get("name", "ml-pipeline-run")
        run_tags = platform_config.get("metadata", {}).get("tags", {})
        if isinstance(run_tags, list):
            run_tags = {tag: "true" for tag in run_tags}

        return {
            "run_name": f"{run_name}-{run_id[:8]}",
            "run_id": run_id,
            "tags": run_tags
        }

    def _execute_pipeline(self, adapter, platform_config: Dict[str, Any], run_id: str, tracker: ExperimentTracker) -> Dict[str, Any]:
        """Execute the ML pipeline using the specified adapter."""
        logger = logging.getLogger(__name__)
        print(f"[MLPlatform] Starting pipeline execution for run_id: {run_id}")

        try:
            self._preregister_chart_probe_paths(adapter, platform_config, run_id)
        except Exception as e:
            logger.warning(f"Failed to pre-register chart probe paths: {e}")

        dynamic_chart_processes = []
        try:
            reporting_cfg = (platform_config or {}).get("reporting") or {}
            dyn_entry = str(reporting_cfg.get("dynamic_entrypoint") or "").strip() if isinstance(reporting_cfg, dict) else ""
            if dyn_entry:
                dynamic_chart_processes = self._start_dynamic_charts(adapter, platform_config, run_id)
        except Exception as _dynamic_e:
            logger.warning(f"Failed to start dynamic charts: {_dynamic_e}")

        data_sources = platform_config.get("data", {}).get("sources", {})
        training_params = platform_config.get("training", {}).get("parameters", {})

        training_path_cfg = data_sources.get("training") or {}
        validation_path_cfg = data_sources.get("validation") or {}

        training_data_path = Path(training_path_cfg["path"]) if isinstance(training_path_cfg, dict) and training_path_cfg.get("path") else None
        validation_data_path = Path(validation_path_cfg["path"]) if isinstance(validation_path_cfg, dict) and validation_path_cfg.get("path") else None

        adapter_kwargs = dict(training_params) if isinstance(training_params, dict) else {}
        adapter_kwargs["data_paths"] = {}
        if training_data_path:
            adapter_kwargs["data_paths"]["training"] = training_data_path
        if validation_data_path:
            adapter_kwargs["data_paths"]["validation"] = validation_data_path

        # Provide selected top-level sections to adapters/workers for cache hashing hooks.
        full_cfg_hash = self._compute_config_hash(platform_config)
        adapter_kwargs["global_config_overrides"] = {
            "reproducibility": (platform_config.get("reproducibility", {}) or {}),
            "data": (platform_config.get("data", {}) or {}),
            "project_config_file_hash": full_cfg_hash,
        }

        # Give adapters a chance to attach the tracker instance if they support it.
        try:
            if hasattr(adapter, "set_tracker"):
                adapter.set_tracker(tracker)
        except Exception:
            pass

        # Pass run_id + tracker down so adapters can keep storage/tracking consistent.
        pipeline_results = adapter.run(
            data_paths=adapter_kwargs.get("data_paths"),
            run_id=run_id,
            tracker=tracker,
            **{k: v for k, v in adapter_kwargs.items() if k != "data_paths"}
        )
        print(f"[MLPlatform] Pipeline completed")

        try:
            if dynamic_chart_processes:
                is_distributed = self._in_distributed_mode()
                if is_distributed:
                    logger.info(f"Dynamic chart jobs submitted to cluster: {', '.join(dynamic_chart_processes)}")
                else:
                    pids = ", ".join([str(p.pid) for p in dynamic_chart_processes if getattr(p, "pid", None)])
                    logger.info(f"Dynamic chart(s) running in background (PIDs: {pids})")

            # Best-effort: upload artifacts produced by dynamic charts (async)
            try:
                self._upload_dynamic_chart_artifacts_async(adapter, run_id)
            except Exception as _dyn_up_e:
                logger.warning(f"Dynamic chart artifact upload failed: {_dyn_up_e}")
        except Exception as _report_e:
            logger.warning(f"Reporting failed: {_report_e}")

        return {
            "run_id": run_id,
            "pipeline_results": pipeline_results
        }

    def _preregister_chart_probe_paths(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> None:
        """No-op retained for backwards compatibility."""
        logging.getLogger(__name__).debug("Probe path pre-registration is a no-op (path-based metrics).")

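    # Illustrative shape of the `reporting` section consumed by the helpers below
    # (keys are taken from the accessors in this file; the values are assumed examples):
    #
    #   reporting:
    #     static_entrypoint: charts/plot_metrics.py   # or `entrypoint`
    #     dynamic_entrypoint: charts/plot_metrics.py  # presence gates dynamic charts in _execute_pipeline
    #     args: ["--format", "png"]
    #     charts:
    #       - name: training_curves
    #         type: dynamic
    #         args: []
    #         probe_paths: ["metrics/train_accuracy"]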
    def _get_reporting_cfg(self, platform_config: Dict[str, Any]) -> Dict[str, Any]:
        cfg = (platform_config or {}).get("reporting") or {}
        return cfg if isinstance(cfg, dict) else {}

    def _get_reporting_entrypoint(self, reporting_cfg: Dict[str, Any]) -> str:
        return str(reporting_cfg.get("static_entrypoint") or reporting_cfg.get("entrypoint") or "").strip()

    def _get_dynamic_chart_specs(self, reporting_cfg: Dict[str, Any]) -> list[dict]:
        charts = reporting_cfg.get("charts") or []
        if not isinstance(charts, list):
            return []
        out: list[dict] = []
        for c in charts:
            if isinstance(c, dict) and str(c.get("type", "")).lower() == "dynamic":
                out.append(c)
        return out

    def _resolve_reporting_entry_to_run(self, configured_entry: Path) -> tuple[Path, str | None]:
        """Resolve the actual script/module to run for reporting.

        If config points to a user script, run the framework entrypoint and import the user file.
        """
        default_entry = self._default_reporting_entrypoint_path()
        if default_entry is None:
            return configured_entry, None
        try:
            if configured_entry.resolve() != default_entry.resolve():
                return default_entry, str(configured_entry)
        except Exception:
            pass
        return default_entry, None

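    # Environment contract handed to chart processes by the launchers below
    # (variable names as set in this file; the SLURM path additionally exports
    # MLOPS_WORKSPACE_DIR):
    #   MLOPS_PROJECT_ID, MLOPS_RUN_ID, MLOPS_OUTPUT_DIR, MLOPS_CHART_NAME,
    #   MLOPS_CHART_TYPE=dynamic, MLOPS_PROBE_PATHS (JSON), and
    #   MLOPS_CHART_IMPORT_FILES when a user script is wrapped by the built-in entrypoint.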
    def _start_dynamic_charts(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
        """Start dynamic charts as background processes (local) or cluster jobs (distributed).

        Returns list of subprocess.Popen objects (local) or job IDs (distributed) for tracking.
        """
        logger = logging.getLogger(__name__)

        # Detect if we're in cluster/distributed mode
        is_distributed = self._in_distributed_mode()

        if is_distributed:
            logger.info("Detected distributed mode - will submit dynamic charts as cluster jobs")
            return self._start_dynamic_charts_distributed(adapter, platform_config, run_id)
        else:
            logger.info("Local mode - will run dynamic charts as background processes")
            return self._start_dynamic_charts_local(adapter, platform_config, run_id)

    def _start_dynamic_charts_local(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
        """Start dynamic charts as local background processes.

        Returns list of subprocess.Popen objects for the started dynamic chart processes.
        """
        logger = logging.getLogger(__name__)

        project_id = self._get_project_id_from_adapter(adapter)

        reporting_cfg = self._get_reporting_cfg(platform_config)
        entrypoint = self._get_reporting_entrypoint(reporting_cfg)
        if not entrypoint:
            return []

        args = list(reporting_cfg.get("args") or [])

        dynamic_charts = self._get_dynamic_chart_specs(reporting_cfg)
        if not dynamic_charts:
            return []

        reporting_python = self._get_reporting_python_exec()

        project_dir_hint = self._get_project_dir_hint(project_id)
        entry = self._resolve_entrypoint_path(entrypoint, project_dir_hint=project_dir_hint)
        if not entry:
            logger.warning(f"Reporting entrypoint not found: {entrypoint}")
            return []

        # Output under the project artifacts directory
        try:
            if project_dir_hint:
                output_base = Path(project_dir_hint) / "artifacts" / "charts" / run_id
            else:
                output_base = get_projects_root(self._repo_root()) / project_id / "artifacts" / "charts" / run_id
        except Exception:
            output_base = Path.cwd() / "projects" / project_id / "artifacts" / "charts" / run_id

        default_entry = self._default_reporting_entrypoint_path()

        dynamic_processes = []

        for spec in dynamic_charts:
            name = str(spec.get("name") or "dynamic_chart").strip()
            chart_out = output_base / name / time.strftime("%Y%m%d_%H%M%S")

            try:
                chart_out.mkdir(parents=True, exist_ok=True)
            except Exception:
                pass
            # Track for later upload
            try:
                self._dynamic_chart_outputs.append((name, chart_out))
            except Exception:
                pass

            env = os.environ.copy()
            applied_run_env = False
            try:
                # Centralized env export for process boundaries (best-effort).
                from mlops.runtime.env_export import export_run_env

                rc = getattr(adapter, "run_context", None)
                if rc is not None:
                    env.update(export_run_env(rc))
                    applied_run_env = True
            except Exception:
                applied_run_env = False
            if project_id:
                env["MLOPS_PROJECT_ID"] = project_id
            env["MLOPS_OUTPUT_DIR"] = str(chart_out)
            env["MLOPS_CHART_NAME"] = name
            env["MLOPS_RUN_ID"] = run_id
            env["MLOPS_CHART_TYPE"] = "dynamic"

            if not applied_run_env:
                self._maybe_apply_cache_env(env, platform_config, project_dir_hint)

            if "probe_paths" in spec:
                try:
                    env["MLOPS_PROBE_PATHS"] = json.dumps(spec.get("probe_paths"))
                except Exception:
                    pass

            entry_to_run, import_file = self._resolve_reporting_entry_to_run(entry)
            try:
                if import_file:
                    env["MLOPS_CHART_IMPORT_FILES"] = import_file
            except Exception:
                pass

            try:
                self._ensure_repo_src_on_pythonpath(env)
            except Exception as _path_e:
                logger.warning(f"Failed to set PYTHONPATH for dynamic chart '{name}': {_path_e}")

            spec_args = list(spec.get("args") or [])
            run_as_module = bool(default_entry and entry_to_run and entry_to_run.resolve() == default_entry.resolve())
            if run_as_module:
                cmd = [reporting_python, "-u", "-m", "mlops.reporting.entrypoint"] + args + spec_args
            else:
                cmd = [reporting_python, "-u", str(entry_to_run)] + args + spec_args

            try:
                logger.info(f"Starting dynamic chart '{name}' in background -> {chart_out}")
                stdout_log = chart_out / "stdout.log"
                stderr_log = chart_out / "stderr.log"

                stdout_file = open(stdout_log, "a", buffering=1)
                stderr_file = open(stderr_log, "a", buffering=1)
                try:
                    proc = subprocess.Popen(cmd, env=env, stdout=stdout_file, stderr=stderr_file, cwd=str(self._repo_root()))
                finally:
                    # Close in parent; child keeps its own fds.
                    try:
                        stdout_file.close()
                    except Exception:
                        pass
                    try:
                        stderr_file.close()
                    except Exception:
                        pass
                dynamic_processes.append(proc)
                logger.info(f"Dynamic chart '{name}' started with PID {proc.pid}, logs: stdout={stdout_log}, stderr={stderr_log}")
            except Exception as _e:
                logger.warning(f"Failed to start dynamic chart '{name}': {_e}")

        if dynamic_processes:
            logger.info(f"Started {len(dynamic_processes)} dynamic chart(s) in background")

        return dynamic_processes

    def _start_dynamic_charts_distributed(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
        """Start dynamic charts via the configured cluster provider.

        Returns list of provider-specific job identifiers (strings)."""
        provider = self._get_cluster_provider_name(adapter)
        if provider == "slurm":
            return self._start_dynamic_charts_distributed_slurm(adapter, platform_config, run_id)
        elif provider == "ansible":
            # Run on head node as a fallback; return labels as job ids
            procs = self._start_dynamic_charts_local(adapter, platform_config, run_id)
            return [f"pid-{getattr(p, 'pid', 'unknown')}" for p in (procs or [])]
        else:
            logging.getLogger(__name__).warning(f"Unknown cluster provider '{provider}'. Falling back to local dynamic charts.")
            procs = self._start_dynamic_charts_local(adapter, platform_config, run_id)
            return [f"pid-{getattr(p, 'pid', 'unknown')}" for p in (procs or [])]

    def _get_cluster_provider_name(self, adapter: Any) -> str:
        """Determine the cluster provider from env or project cluster_config.yaml."""
        try:
            prov = os.environ.get("MLOPS_CLUSTER_PROVIDER")
            if isinstance(prov, str) and prov.strip():
                return prov.strip().lower()
        except Exception:
            pass
        # Try project cluster_config.yaml
        project_id = self._get_project_id_from_adapter(adapter)
        project_dir_hint = self._get_project_dir_hint(project_id)
        if project_dir_hint:
            cfg_path = Path(project_dir_hint) / "configs" / "cluster_config.yaml"
        else:
            cfg_path = get_projects_root(self._repo_root()) / project_id / "configs" / "cluster_config.yaml"
        try:
            if cfg_path.exists():
                with open(cfg_path) as f:
                    data = yaml.safe_load(f) or {}
                provider = data.get("provider")
                if isinstance(provider, str) and provider.strip():
                    return provider.strip().lower()
        except Exception:
            pass
        return "slurm"

    def _start_dynamic_charts_distributed_slurm(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
        """Start dynamic charts by submitting SLURM sbatch jobs."""
        logger = logging.getLogger(__name__)

        project_id = self._get_project_id_from_adapter(adapter)

        reporting_cfg = self._get_reporting_cfg(platform_config)
        entrypoint = self._get_reporting_entrypoint(reporting_cfg)
        if not entrypoint:
            return []

        args = list(reporting_cfg.get("args") or [])
        dynamic_charts = self._get_dynamic_chart_specs(reporting_cfg)
        if not dynamic_charts:
            return []

        reporting_python = self._get_reporting_python_exec()
        project_dir_hint = self._get_project_dir_hint(project_id)
        entry = self._resolve_entrypoint_path(entrypoint, project_dir_hint=project_dir_hint)
        if not entry:
            logger.warning(f"Reporting entrypoint not found: {entrypoint}")
            return []

        # Output under the project artifacts directory
        try:
            if project_dir_hint:
                output_base = Path(project_dir_hint) / "artifacts" / "charts" / run_id
            else:
                output_base = get_projects_root(self._repo_root()) / project_id / "artifacts" / "charts" / run_id
        except Exception:
            output_base = Path.cwd() / "projects" / project_id / "artifacts" / "charts" / run_id

        default_entry = self._default_reporting_entrypoint_path()
        entry_to_run, import_file = self._resolve_reporting_entry_to_run(entry)
        run_as_module = bool(default_entry and entry_to_run and entry_to_run.resolve() == default_entry.resolve())

        job_ids: list[str] = []
        for spec in dynamic_charts:
            name = str(spec.get("name") or "dynamic_chart").strip()
            chart_out = output_base / name / time.strftime("%Y%m%d_%H%M%S")
            try:
                chart_out.mkdir(parents=True, exist_ok=True)
            except Exception:
                pass
            try:
                self._dynamic_chart_outputs.append((name, chart_out))
            except Exception:
                pass

            env_vars = []
            if project_id:
                env_vars.append(f"MLOPS_PROJECT_ID={project_id}")
            try:
                env_vars.append(f"MLOPS_WORKSPACE_DIR={self._repo_root()}")
            except Exception:
                pass
            env_vars.append(f"MLOPS_OUTPUT_DIR={chart_out}")
            env_vars.append(f"MLOPS_CHART_NAME={name}")
            env_vars.append(f"MLOPS_RUN_ID={run_id}")
            env_vars.append(f"MLOPS_CHART_TYPE=dynamic")

            # Add import file if using user script
            if import_file:
                env_vars.append(f"MLOPS_CHART_IMPORT_FILES={import_file}")

            tmp_env = {}
            self._maybe_apply_cache_env(tmp_env, platform_config, project_dir_hint)
            for k, v in tmp_env.items():
                env_vars.append(f"{k}={v}")

            if "probe_paths" in spec:
                try:
                    probe_paths_json = json.dumps(spec.get("probe_paths")).replace('"', '\\"')
                    env_vars.append(f'MLOPS_PROBE_PATHS="{probe_paths_json}"')
                except Exception:
                    pass

            try:
                src_root = infer_source_root()
                if src_root and (src_root / "src").exists():
                    repo_src = str(src_root / "src")
                    env_vars.append(f"PYTHONPATH={repo_src}:$PYTHONPATH")
            except Exception:
                pass

            chart_cmd_args = args + list(spec.get("args") or [])

            if run_as_module:
                chart_cmd = f"{reporting_python} -u -m mlops.reporting.entrypoint {' '.join(chart_cmd_args)}"
            else:
                chart_cmd = f"{reporting_python} -u {entry_to_run} {' '.join(chart_cmd_args)}"

            sbatch_script = f"""#!/bin/bash
#SBATCH --job-name={name}
#SBATCH --output={chart_out}/slurm-%j.out
#SBATCH --error={chart_out}/slurm-%j.err
#SBATCH --time=01:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G

# Export environment variables
{chr(10).join(f'export {var}' for var in env_vars)}

# Run the chart
{chart_cmd}
"""

            script_path = chart_out / f"submit_{name}.sh"
            try:
                script_path.write_text(sbatch_script)
                script_path.chmod(0o755)
            except Exception as e:
                logger.warning(f"Failed to write sbatch script for '{name}': {e}")
                continue

            try:
                logger.info(f"Submitting dynamic chart '{name}' to cluster -> {chart_out}")
                result = subprocess.run(["sbatch", str(script_path)], capture_output=True, text=True, check=True)
                output = result.stdout.strip()
                if "Submitted batch job" in output:
                    job_id = output.split()[-1]
                    job_ids.append(job_id)
                    logger.info(f"Dynamic chart '{name}' submitted as job {job_id}")
                else:
                    logger.warning(f"Unexpected sbatch output for '{name}': {output}")
            except Exception as e:
                logger.warning(f"Failed to submit dynamic chart '{name}': {e}")

        if job_ids:
            logger.info(f"Submitted {len(job_ids)} dynamic chart(s) to cluster: {', '.join(job_ids)}")
        return job_ids

    def _upload_dynamic_chart_artifacts_async(self, adapter: Any, run_id: str) -> None:
        """Upload PNG artifacts from dynamic chart output directories asynchronously.

        This runs in a background thread to avoid blocking the main execution flow.
        """
        import threading
        logger = logging.getLogger(__name__)

        def _upload_worker():
            try:
                self._upload_dynamic_chart_artifacts(adapter, run_id)
            except Exception as e:
                logger.warning(f"Async dynamic chart upload failed: {e}")

        # Start upload in background thread
        upload_thread = threading.Thread(target=_upload_worker, daemon=True)
        upload_thread.start()
        logger.info("Dynamic chart artifact upload started in background")

    def _upload_dynamic_chart_artifacts(self, adapter: Any, run_id: str) -> None:
        """Upload PNG artifacts from dynamic chart output directories and record them in KV.

        Best-effort: skips silently if directories are missing or object store is unavailable.
        """
        logger = logging.getLogger(__name__)
        try:
            tracked: list = getattr(self, "_dynamic_chart_outputs", []) or []
        except Exception:
            tracked = []
        if not tracked:
            return
        uploaded_any = False
        for item in tracked:
            try:
                name, chart_out = item
            except Exception:
                continue
            try:
                # Ensure path is a Path
                chart_out = Path(chart_out)
            except Exception:
                continue
            if not chart_out.exists():
                # Skip missing dirs (e.g., remote-only paths)
                continue
            try:
                self._upload_single_chart_artifacts(adapter, run_id, name, chart_out, chart_type="dynamic")
                uploaded_any = True
            except Exception as _e:
                logger.warning(f"Dynamic chart upload failed for '{name}': {_e}")
        if uploaded_any:
            logger.info("Uploaded dynamic chart artifacts and recorded in KV store")

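    # Uploaded chart images are keyed as (bucket name is configuration-dependent):
    #   gs://<bucket>/projects/<project_id>/charts/<run_id>/<chart_name>/<file>.png
    # A local cache_path is always recorded so the web UI can fall back to serving
    # the file from disk when no object store is configured.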
    def _upload_single_chart_artifacts(self, adapter: Any, run_id: str, name: str, chart_out: Path, chart_type: str | None = None) -> None:
        """Upload PNG artifacts for a single chart and record them in KV.

        chart_type: optional "static" or "dynamic" for UI differentiation.
        """
        logger = logging.getLogger(__name__)
        ssm = getattr(adapter, 'step_state_manager', None)
        obj_store = getattr(ssm, 'object_store', None) if ssm else None
        kv = getattr(ssm, 'kv_store', None) if ssm else None
        try:
            project_ns = os.environ.get('MLOPS_PROJECT_ID', '')
        except Exception:
            project_ns = ''
        # If bucket known, build absolute gs:// directory prefix for improved UX
        abs_charts_root = None
        try:
            if obj_store and hasattr(obj_store, '_bucket') and getattr(obj_store, '_bucket') is not None:
                bname = getattr(getattr(obj_store, '_bucket'), 'name', None)
                if bname:
                    abs_charts_root = f"gs://{bname}/projects/{project_ns}/charts/{run_id}"
        except Exception:
            abs_charts_root = None
        import time as _time
        artifacts: list[dict] = []
        # Capture PNGs recursively to support nested structures under chart output
        for p in chart_out.rglob("*.png"):
            obj_path = None
            # Always capture local cache path so UI can fetch from server if object store is unavailable
            try:
                local_path = str(p.resolve())
            except Exception:
                local_path = str(p)
            if obj_store:
                try:
                    base = f"projects/{project_ns}/charts/{run_id}/{name}"
                    if abs_charts_root:
                        base = f"{abs_charts_root}/{name}"
                    remote = obj_store.build_uri(base, p.name)
                    with open(p, 'rb') as f:
                        obj_store.put_bytes(remote, f.read(), content_type="image/png")
                    obj_path = remote
                except Exception as _ue:
                    logger.warning(f"Upload failed for chart '{name}' file {p.name}: {_ue}")
            if not obj_path:
                # Fallback to local cache path if no remote object path
                obj_path = local_path
            try:
                artifacts.append({
                    "title": p.name,
                    "object_path": obj_path,
                    "cache_path": local_path,
                    "mime_type": "image/png",
                    "size_bytes": p.stat().st_size,
                    "created_at": _time.time(),
                    "chart_type": (chart_type or "static"),
                })
            except Exception:
                pass
        # Record artifacts in KV store for UI listing
        try:
            if kv and hasattr(kv, 'record_run_chart_artifacts'):
                kv.record_run_chart_artifacts(run_id, name, artifacts)
        except Exception:
            pass


    # -------------------- Cloud bootstrap helpers --------------------

    def _configure_logging(self, config: Dict[str, Any], project_path: Path) -> None:
        """Configure Python logging system based on the config.

        Honors MLOPS_RUN_LOG_FILE env var to force a unique per-run log file.
        """
        # Prefer explicit per-run log path if it is timestamped; otherwise, create a timestamped file.
        env_log = os.environ.get("MLOPS_RUN_LOG_FILE")
        chosen_log_file: str
        if env_log:
            try:
                name = Path(env_log).name
                pattern = rf"^{re.escape(project_path.name)}_\d{{8}}_\d{{6}}\.log$"
                if re.match(pattern, name):
                    chosen_log_file = env_log
                else:
                    raise ValueError("Non-timestamped log path provided; overriding with timestamped path")
            except Exception:
                ts = datetime.now().strftime("%Y%m%d_%H%M%S")
                chosen_log_file = str(project_path / "logs" / f"{project_path.name}_{ts}.log")
                self._set_env_var("MLOPS_RUN_LOG_FILE", chosen_log_file)
        else:
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            chosen_log_file = str(project_path / "logs" / f"{project_path.name}_{ts}.log")
            self._set_env_var("MLOPS_RUN_LOG_FILE", chosen_log_file)
        log_path = Path(chosen_log_file)
        log_path.parent.mkdir(parents=True, exist_ok=True)

        handlers = [logging.FileHandler(str(log_path), encoding="utf-8")]
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=handlers,
            force=True
        )

        # Redirect stdout/stderr prints to the logging system so nothing goes to the terminal
        class _StreamToLogger:
            def __init__(self, logger: logging.Logger, level: int):
                self.logger = logger
                self.level = level
                self._buffer = ""

            def write(self, message: str) -> None:
                if not message:
                    return
                self._buffer += message
                while "\n" in self._buffer:
                    line, self._buffer = self._buffer.split("\n", 1)
                    if line.strip():
                        self.logger.log(self.level, line)

            def flush(self) -> None:
                if self._buffer.strip():
                    self.logger.log(self.level, self._buffer.strip())
                self._buffer = ""

        root_logger = logging.getLogger()
        sys.stdout = _StreamToLogger(root_logger, logging.INFO)
        sys.stderr = _StreamToLogger(root_logger, logging.ERROR)

    def run_pipeline_for_project(self, project_id: str, config_path: str) -> Dict[str, Any]:
        """
        Run the ML pipeline for a specific project.

        Args:
            project_id: The project identifier
            config_path: Path to the project's configuration file

        Returns:
            Pipeline execution results
        """
        project_manager = ProjectManager()

        if not project_manager.project_exists(project_id):
            raise ValueError(f"Project '{project_id}' does not exist")

        project_path = project_manager.get_project_path(project_id)

        with open(config_path) as f:
            platform_config = yaml.safe_load(f)

        self._update_config_for_project(platform_config, project_path, project_id)

        # Export project id so reporting and upload paths use the correct namespace.
        self._set_env_var("MLOPS_PROJECT_ID", str(project_id))

        self._configure_logging(platform_config, project_path)
        logger = logging.getLogger(__name__)

        tracker = self._initialize_tracker(platform_config)
        repro_manager = ReproducibilityManager(config_path, tracker_instance=tracker, project_path=project_path)

        repro_manager.ensure_reproducibility_setup()
        # Ensure both runtime and reporting environments are initialized so their interpreters are available
        try:
            repro_manager.setup_environment()
        except Exception as _env_e:
            logger.warning(f"Environment setup skipped or failed: {_env_e}")

        # Export the project runtime and reporting interpreters so auto-installs go into the venvs
        try:
            if getattr(repro_manager, 'python_interpreter', None):
                self._set_env_var("MLOPS_RUNTIME_PYTHON", str(repro_manager.python_interpreter))
        except Exception:
            pass
        try:
            rep_py = getattr(repro_manager, "reporting_python_interpreter", None)
            if rep_py:
                self._set_env_var("MLOPS_REPORTING_PYTHON", str(rep_py))
        except Exception:
            pass

        # Export reporting config for workers (charts metadata and entrypoints)
        try:
            rep_cfg_in = (platform_config or {}).get('reporting', {}) or {}
            rep_cfg: dict = rep_cfg_in if isinstance(rep_cfg_in, dict) else {}
            # Include interpreter hints so chart runners can reliably select the reporting env
            # even when env propagation is imperfect (e.g., distributed workers).
            try:
                if getattr(repro_manager, "python_interpreter", None):
                    rep_cfg.setdefault("runtime_python", str(repro_manager.python_interpreter))
                rep_py = getattr(repro_manager, "reporting_python_interpreter", None)
                if rep_py:
                    rep_cfg.setdefault("reporting_python", str(rep_py))
            except Exception:
                pass
            self._set_env_var("MLOPS_REPORTING_CONFIG", json.dumps(rep_cfg))
        except Exception:
            pass

        model_section = platform_config.get("model", {})
        try:
            repro_manager.apply_cloud_env_from_config(model_section)
            repro_manager.ensure_cloud_dependencies(model_section)
        except Exception as _cloud_e:
            logger.warning(f"Cloud bootstrap skipped or failed: {_cloud_e}")


        run_id = self._generate_run_id(platform_config, project_id)
        run_metadata = self._prepare_run_metadata(platform_config, run_id)

        # Keep this exact substring ("with run_id:") stable; cluster controller parses it.
        print(f"Executing project '{project_id}' with run_id: {run_id}")

        run_started = False
        final_status = "FINISHED"

        try:
            tracker.start_run(**run_metadata)
            run_started = True
            model_config = AdapterConfig(**platform_config["model"])

            # Build a typed run context so adapters/executors can avoid relying on implicit env vars.
            try:
                from mlops.runtime.context import RunContext
                workspace_root = get_workspace_root()
                cache_cfg = ((platform_config.get("model") or {}).get("parameters") or {}).get("cache") or {}
                cache_cfg = cache_cfg if isinstance(cache_cfg, dict) else {}
                backend_cfg = cache_cfg.get("backend") or {}
                backend_cfg = backend_cfg if isinstance(backend_cfg, dict) else {}
                reporting_cfg = (platform_config.get("reporting") or {}) if isinstance(platform_config.get("reporting"), dict) else {}
                run_context = RunContext(
                    workspace_root=workspace_root,
                    project_id=str(project_id),
                    project_root=project_path,
                    run_id=str(run_id),
                    runtime_python=getattr(repro_manager, "python_interpreter", None),
                    reporting_python=getattr(repro_manager, "reporting_python_interpreter", None),
                    cache_backend=dict(backend_cfg),
                    cache_config=dict(cache_cfg),
                    reporting_config=dict(reporting_cfg),
                )
            except Exception:
                run_context = None

            adapter = self.adapter_manager.create_adapter(
                platform_config["model"]["framework"],
                model_config,
                python_interpreter=repro_manager.python_interpreter,
                environment_name=repro_manager.environment_name,
                project_path=project_manager.get_project_path(project_id),
                run_context=run_context,
            )

            if adapter is None:
                raise ValueError(f"Could not create adapter for framework: {platform_config['model']['framework']}")

            adapter.initialize()
            try:
                if hasattr(adapter, "set_tracker"):
                    adapter.set_tracker(tracker)
            except Exception:
                pass

            pipeline_results = self._execute_pipeline(adapter, platform_config, run_id, tracker)

            saved_artifact_paths = repro_manager.save_run_artifacts_locally(run_id, adapter)
            pipeline_results["artifact_paths"] = saved_artifact_paths

            config_hash = self._compute_config_hash(platform_config)
            project_manager.add_run_to_project(project_id, run_id, config_hash)

            print(f"Project '{project_id}' pipeline execution completed successfully!")
            return pipeline_results

        except Exception as e:
            final_status = "FAILED"
            print(f"Pipeline execution failed: {e}")
            raise
        finally:
            if run_started:
                try:
                    if hasattr(tracker, "run_active"):
                        if getattr(tracker, "run_active"):
                            tracker.end_run(status=final_status)
                    else:
                        tracker.end_run(status=final_status)
                except Exception:
                    pass

    def _update_config_for_project(self, config: Dict[str, Any], project_path: Path, project_id: str) -> None:
        """Update configuration paths to be project-specific."""
        repro = config.get("reproducibility")
        if isinstance(repro, dict):
            artifacts_config = repro.get("artifacts")
            if isinstance(artifacts_config, dict):
                model_cfg = artifacts_config.get("model")
                if isinstance(model_cfg, dict):
                    model_cfg["path"] = str(project_path / "artifacts" / "models")
                data_cfg = artifacts_config.get("data")
                if isinstance(data_cfg, dict):
                    data_cfg["path"] = str(project_path / "artifacts" / "data")

            tracking_config = repro.get("experiment_tracking")
            if isinstance(tracking_config, dict):
                params = tracking_config.get("parameters")
                if isinstance(params, dict):
                    tracking_uri = params.get("tracking_uri")
                    if isinstance(tracking_uri, str) and "sqlite" in tracking_uri:
                        params["tracking_uri"] = f"sqlite:///{project_path}/artifacts/experiments.db"
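A minimal invocation sketch of the class above, for orientation only: the project id and config path are hypothetical, and the call assumes the project already exists under the workspace's projects/ directory (see ProjectManager). The method name and returned keys are taken from the code shown.

    from mlops.platform import MLPlatform

    platform = MLPlatform()
    results = platform.run_pipeline_for_project(
        "my-project",
        "projects/my-project/configs/project_config.yaml",
    )
    print(results["run_id"], results["artifact_paths"])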