expops-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
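The largest module in the wheel, mlops/platform.py, is reproduced in full below. For orientation, here is a minimal sketch of the project configuration shape that MLPlatform reads, written as the Python dict that yaml.safe_load would produce from project_config.yaml; the keys mirror the accessors in the code below, while every value is a hypothetical placeholder:

    platform_config = {
        "metadata": {"name": "ml-pipeline-run", "tags": {"team": "demo"}},  # read by _prepare_run_metadata
        "run_id": "auto-generated",                                         # _generate_run_id appends a unique suffix
        "model": {
            "framework": "sklearn",                                         # adapter lookup in create_adapter
            "parameters": {"cache": {"backend": {}}},                       # read by _maybe_apply_cache_env
        },
        "data": {"sources": {"training": {"path": "data/train.csv"}}},      # resolved in _execute_pipeline
        "training": {"parameters": {}},                                     # forwarded to adapter.run(**kwargs)
        "reporting": {
            "static_entrypoint": "charts/plot_metrics.py",                  # _get_reporting_entrypoint
            "charts": [{"name": "metrics", "type": "dynamic"}],             # _get_dynamic_chart_specs
        },
        "reproducibility": {"experiment_tracking": {"backend": "noop"}},    # _initialize_tracker
    }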
mlops/platform.py ADDED
@@ -0,0 +1,996 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any, Dict
+
+ from datetime import datetime
+ import hashlib
+ import json
+ import logging
+ import os
+ import re
+ import subprocess
+ import sys
+ import time
+ import uuid
+
+ import yaml
+
+ from .adapters.plugin_manager import AdapterPluginManager
+ from .adapters.config_schema import AdapterConfig
+ from .managers.reproducibility_manager import ReproducibilityManager
+ from .managers.project_manager import ProjectManager
+ from .core.experiment_tracker import ExperimentTracker, NoOpExperimentTracker
+ from .core.workspace import get_projects_root, get_workspace_root, infer_source_root
+
+
+ class MLPlatform:
+     """Main platform class that orchestrates the ML pipeline."""
+
+     def __init__(self) -> None:
+         self.adapter_manager = AdapterPluginManager()
+         # Adapter discovery already tries both `mlops.*` and `src.mlops.*` layouts.
+         self.adapter_manager.discover_adapters("mlops.adapters")
+
+         # Track output directories of dynamic charts so we can upload artifacts later.
+         self._dynamic_chart_outputs: list[tuple[str, Path]] = []
+
+     def _initialize_tracker(self, platform_config: Dict[str, Any]) -> ExperimentTracker:
+         """Initialize experiment tracker from configuration."""
+         logger = logging.getLogger(__name__)
+         tracking_config = platform_config.get("reproducibility", {}).get("experiment_tracking", {})
+         tracker_name = str(tracking_config.get("backend", "noop") or "noop").strip().lower()
+         tracker_params = tracking_config.get("parameters", {})
+         if not isinstance(tracker_params, dict):
+             tracker_params = {}
+
+         # NOTE: The platform's primary metrics path is `mlops.core.step_system.log_metric`
+         # (KV-store based). External experiment tracking backends are optional.
+         if tracker_name not in {"noop"}:
+             logger.warning(
+                 f"Experiment tracker backend '{tracker_name}' is not available in this build. "
+                 "Falling back to NoOpExperimentTracker."
+             )
+
+         return NoOpExperimentTracker(config=tracker_params)
+
+     def _repo_root(self) -> Path:
+         # Legacy name; this is now the workspace root (where projects/ lives).
+         try:
+             return get_workspace_root()
+         except Exception:
+             return Path.cwd()
+
+     def _set_env_var(self, key: str, value: str) -> None:
+         """Best-effort environment variable setter (never raises)."""
+         try:
+             os.environ[key] = value
+         except Exception:
+             pass
+
+     def _get_reporting_python_exec(self) -> str:
+         try:
+             return os.environ.get("MLOPS_REPORTING_PYTHON") or os.environ.get("MLOPS_RUNTIME_PYTHON") or sys.executable
+         except Exception:
+             return sys.executable
+
+     def _in_distributed_mode(self) -> bool:
+         return bool(os.environ.get("DASK_SCHEDULER_ADDRESS") or os.environ.get("MLOPS_CLUSTER_MODE"))
+
+     def _get_project_id_from_adapter(self, adapter: Any) -> str:
+         try:
+             ssm = getattr(adapter, 'step_state_manager', None)
+             kv = getattr(ssm, 'kv_store', None) if ssm else None
+             return (getattr(kv, 'project_id', None) if kv else None) or os.environ.get("MLOPS_PROJECT_ID") or ""
+         except Exception:
+             return os.environ.get("MLOPS_PROJECT_ID") or ""
+
+     def _get_project_dir_hint(self, project_id: str):
+         try:
+             pm = ProjectManager()
+             return pm.get_project_path(project_id)
+         except Exception:
+             return None
+
+     def _resolve_entrypoint_path(self, entrypoint: str, project_dir_hint: Path | None = None) -> Path | None:
+         try:
+             ep = Path(entrypoint)
+             if ep.is_absolute() and ep.exists():
+                 return ep
+             if ep.exists():
+                 return ep
+             if project_dir_hint:
+                 cand = (Path(project_dir_hint) / entrypoint)
+                 if cand.exists():
+                     return cand
+             ws = self._repo_root()
+             cand = (ws / entrypoint)
+             return cand if cand.exists() else None
+         except Exception:
+             return None
+
+     def _default_reporting_entrypoint_path(self) -> Path | None:
+         """Return the built-in reporting entrypoint file path (inside the installed package)."""
+         try:
+             import mlops.reporting.entrypoint as _entry
+             p = Path(getattr(_entry, "__file__", "") or "")
+             return p if p.exists() else None
+         except Exception:
+             return None
+
+     def _maybe_apply_cache_env(self, env: dict, platform_config: Dict[str, Any], project_dir_hint) -> None:
+         cache_cfg = ((platform_config.get("model") or {}).get("parameters") or {}).get("cache") or {}
+         if not isinstance(cache_cfg, dict):
+             return
+         backend_cfg = cache_cfg.get("backend") or {}
+         if not isinstance(backend_cfg, dict):
+             return
+
+         gcp_project = backend_cfg.get("gcp_project")
+         if gcp_project:
+             env["GOOGLE_CLOUD_PROJECT"] = str(gcp_project)
+         emulator_host = backend_cfg.get("emulator_host")
+         if emulator_host:
+             env["FIRESTORE_EMULATOR_HOST"] = str(emulator_host)
+         creds_path = backend_cfg.get("credentials_json")
+         if creds_path:
+             try:
+                 creds_path_val = str(creds_path)
+                 if not Path(creds_path_val).is_absolute() and project_dir_hint:
+                     creds_path_val = str(Path(project_dir_hint) / creds_path_val)
+                 env["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path_val
+             except Exception:
+                 pass
+
+     def _ensure_repo_src_on_pythonpath(self, env: dict) -> None:
+         src_root = infer_source_root()
+         if not src_root:
+             return
+         repo_src = str(src_root / "src")
+         prev_pp = str(env.get("PYTHONPATH", "") or "")
+         if repo_src in prev_pp.split(":"):
+             return
+         env["PYTHONPATH"] = f"{repo_src}:{prev_pp}".rstrip(":")
+
+     def _compute_config_hash(self, config_content: Dict[str, Any]) -> str:
+         """Compute a stable hash of the configuration content (excluding run_id)."""
+         logger = logging.getLogger(__name__)
+         try:
+             config_copy = dict(config_content)
+             config_copy.pop("run_id", None)
+
+             config_str = json.dumps(config_copy, sort_keys=True, default=str, separators=(",", ":"))
+
+             return hashlib.sha256(config_str.encode()).hexdigest()
+         except Exception as e:
+             logger.warning(f"Failed to compute config hash: {e}. Falling back to random UUID-based hash.")
+             return str(uuid.uuid4()).replace("-", "")[:16]
+
+     def _generate_run_id(self, platform_config: Dict[str, Any], project_id: str | None = None) -> str:
+         """Generate or extract run ID from configuration."""
+         run_id_from_config = platform_config.get("run_id")
+         if run_id_from_config and run_id_from_config not in ["${RUN_ID:-auto-generated}", "auto-generated"]:
+             return str(run_id_from_config)
+
+         # Always add a unique suffix so every execution has a distinct run_id
+         unique_suffix = datetime.utcnow().strftime("%Y%m%d%H%M%S") + "-" + uuid.uuid4().hex[:8]
+         if project_id:
+             return f"project-{project_id}-{unique_suffix}"
+         return f"config-{unique_suffix}"
+
+     def _prepare_run_metadata(self, platform_config: Dict[str, Any], run_id: str) -> Dict[str, Any]:
+         """Prepare run metadata for tracking."""
+         run_name = platform_config.get("metadata", {}).get("name", "ml-pipeline-run")
+         run_tags = platform_config.get("metadata", {}).get("tags", {})
+         if isinstance(run_tags, list):
+             run_tags = {tag: "true" for tag in run_tags}
+
+         return {
+             "run_name": f"{run_name}-{run_id[:8]}",
+             "run_id": run_id,
+             "tags": run_tags
+         }
+
+     def _execute_pipeline(self, adapter, platform_config: Dict[str, Any], run_id: str, tracker: ExperimentTracker) -> Dict[str, Any]:
+         """Execute the ML pipeline using the specified adapter."""
+         logger = logging.getLogger(__name__)
+         print(f"[MLPlatform] Starting pipeline execution for run_id: {run_id}")
+
+         try:
+             self._preregister_chart_probe_paths(adapter, platform_config, run_id)
+         except Exception as e:
+             logger.warning(f"Failed to pre-register chart probe paths: {e}")
+
+         dynamic_chart_processes = []
+         try:
+             reporting_cfg = (platform_config or {}).get("reporting") or {}
+             dyn_entry = str(reporting_cfg.get("dynamic_entrypoint") or "").strip() if isinstance(reporting_cfg, dict) else ""
+             if dyn_entry:
+                 dynamic_chart_processes = self._start_dynamic_charts(adapter, platform_config, run_id)
+         except Exception as _dynamic_e:
+             logger.warning(f"Failed to start dynamic charts: {_dynamic_e}")
+
+         data_sources = platform_config.get("data", {}).get("sources", {})
+         training_params = platform_config.get("training", {}).get("parameters", {})
+
+         training_path_cfg = data_sources.get("training") or {}
+         validation_path_cfg = data_sources.get("validation") or {}
+
+         training_data_path = Path(training_path_cfg["path"]) if isinstance(training_path_cfg, dict) and training_path_cfg.get("path") else None
+         validation_data_path = Path(validation_path_cfg["path"]) if isinstance(validation_path_cfg, dict) and validation_path_cfg.get("path") else None
+
+         adapter_kwargs = dict(training_params) if isinstance(training_params, dict) else {}
+         adapter_kwargs["data_paths"] = {}
+         if training_data_path:
+             adapter_kwargs["data_paths"]["training"] = training_data_path
+         if validation_data_path:
+             adapter_kwargs["data_paths"]["validation"] = validation_data_path
+
+         # Provide selected top-level sections to adapters/workers for cache hashing hooks.
+         full_cfg_hash = self._compute_config_hash(platform_config)
+         adapter_kwargs["global_config_overrides"] = {
+             "reproducibility": (platform_config.get("reproducibility", {}) or {}),
+             "data": (platform_config.get("data", {}) or {}),
+             "project_config_file_hash": full_cfg_hash,
+         }
+
+         # Give adapters a chance to attach the tracker instance if they support it.
+         try:
+             if hasattr(adapter, "set_tracker"):
+                 adapter.set_tracker(tracker)
+         except Exception:
+             pass
+
+         # Pass run_id + tracker down so adapters can keep storage/tracking consistent.
+         pipeline_results = adapter.run(
+             data_paths=adapter_kwargs.get("data_paths"),
+             run_id=run_id,
+             tracker=tracker,
+             **{k: v for k, v in adapter_kwargs.items() if k != "data_paths"}
+         )
+         print("[MLPlatform] Pipeline completed")
+
+         try:
+             if dynamic_chart_processes:
+                 is_distributed = self._in_distributed_mode()
+                 if is_distributed:
+                     logger.info(f"Dynamic chart jobs submitted to cluster: {', '.join(dynamic_chart_processes)}")
+                 else:
+                     pids = ", ".join([str(p.pid) for p in dynamic_chart_processes if getattr(p, "pid", None)])
+                     logger.info(f"Dynamic chart(s) running in background (PIDs: {pids})")
+
+                 # Best-effort: upload artifacts produced by dynamic charts (async)
+                 try:
+                     self._upload_dynamic_chart_artifacts_async(adapter, run_id)
+                 except Exception as _dyn_up_e:
+                     logger.warning(f"Dynamic chart artifact upload failed: {_dyn_up_e}")
+         except Exception as _report_e:
+             logger.warning(f"Reporting failed: {_report_e}")
+
+         return {
+             "run_id": run_id,
+             "pipeline_results": pipeline_results
+         }
+
+     def _preregister_chart_probe_paths(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> None:
+         """No-op retained for backwards compatibility."""
+         logging.getLogger(__name__).debug("Probe path pre-registration is a no-op (path-based metrics).")
+
+     def _get_reporting_cfg(self, platform_config: Dict[str, Any]) -> Dict[str, Any]:
+         cfg = (platform_config or {}).get("reporting") or {}
+         return cfg if isinstance(cfg, dict) else {}
+
+     def _get_reporting_entrypoint(self, reporting_cfg: Dict[str, Any]) -> str:
+         return str(reporting_cfg.get("static_entrypoint") or reporting_cfg.get("entrypoint") or "").strip()
+
+     def _get_dynamic_chart_specs(self, reporting_cfg: Dict[str, Any]) -> list[dict]:
+         charts = reporting_cfg.get("charts") or []
+         if not isinstance(charts, list):
+             return []
+         out: list[dict] = []
+         for c in charts:
+             if isinstance(c, dict) and str(c.get("type", "")).lower() == "dynamic":
+                 out.append(c)
+         return out
+
+     def _resolve_reporting_entry_to_run(self, configured_entry: Path) -> tuple[Path, str | None]:
+         """Resolve the actual script/module to run for reporting.
+
+         If the config points to a user script, run the framework entrypoint and import the user file.
+         """
+         default_entry = self._default_reporting_entrypoint_path()
+         if default_entry is None:
+             return configured_entry, None
+         try:
+             if configured_entry.resolve() != default_entry.resolve():
+                 return default_entry, str(configured_entry)
+         except Exception:
+             pass
+         return default_entry, None
+
+     def _start_dynamic_charts(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
+         """Start dynamic charts as background processes (local) or cluster jobs (distributed).
+
+         Returns a list of subprocess.Popen objects (local) or job IDs (distributed) for tracking.
+         """
+         logger = logging.getLogger(__name__)
+
+         # Detect whether we're in cluster/distributed mode.
+         is_distributed = self._in_distributed_mode()
+
+         if is_distributed:
+             logger.info("Detected distributed mode - will submit dynamic charts as cluster jobs")
+             return self._start_dynamic_charts_distributed(adapter, platform_config, run_id)
+         else:
+             logger.info("Local mode - will run dynamic charts as background processes")
+             return self._start_dynamic_charts_local(adapter, platform_config, run_id)
+
+     def _start_dynamic_charts_local(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
+         """Start dynamic charts as local background processes.
+
+         Returns a list of subprocess.Popen objects for the started dynamic chart processes.
+         """
+         logger = logging.getLogger(__name__)
+
+         project_id = self._get_project_id_from_adapter(adapter)
+
+         reporting_cfg = self._get_reporting_cfg(platform_config)
+         entrypoint = self._get_reporting_entrypoint(reporting_cfg)
+         if not entrypoint:
+             return []
+
+         args = list(reporting_cfg.get("args") or [])
+
+         dynamic_charts = self._get_dynamic_chart_specs(reporting_cfg)
+         if not dynamic_charts:
+             return []
+
+         reporting_python = self._get_reporting_python_exec()
+
+         project_dir_hint = self._get_project_dir_hint(project_id)
+         entry = self._resolve_entrypoint_path(entrypoint, project_dir_hint=project_dir_hint)
+         if not entry:
+             logger.warning(f"Reporting entrypoint not found: {entrypoint}")
+             return []
+
+         # Output under the project artifacts directory
+         try:
+             if project_dir_hint:
+                 output_base = Path(project_dir_hint) / "artifacts" / "charts" / run_id
+             else:
+                 output_base = get_projects_root(self._repo_root()) / project_id / "artifacts" / "charts" / run_id
+         except Exception:
+             output_base = Path.cwd() / "projects" / project_id / "artifacts" / "charts" / run_id
+
+         default_entry = self._default_reporting_entrypoint_path()
+
+         dynamic_processes = []
+
+         for spec in dynamic_charts:
+             name = str(spec.get("name") or "dynamic_chart").strip()
+             chart_out = output_base / name / time.strftime("%Y%m%d_%H%M%S")
+
+             try:
+                 chart_out.mkdir(parents=True, exist_ok=True)
+             except Exception:
+                 pass
+             # Track for later upload
+             try:
+                 self._dynamic_chart_outputs.append((name, chart_out))
+             except Exception:
+                 pass
+
+             env = os.environ.copy()
+             applied_run_env = False
+             try:
+                 # Centralized env export for process boundaries (best-effort).
+                 from mlops.runtime.env_export import export_run_env
+
+                 rc = getattr(adapter, "run_context", None)
+                 if rc is not None:
+                     env.update(export_run_env(rc))
+                     applied_run_env = True
+             except Exception:
+                 applied_run_env = False
+             if project_id:
+                 env["MLOPS_PROJECT_ID"] = project_id
+             env["MLOPS_OUTPUT_DIR"] = str(chart_out)
+             env["MLOPS_CHART_NAME"] = name
+             env["MLOPS_RUN_ID"] = run_id
+             env["MLOPS_CHART_TYPE"] = "dynamic"
+
+             if not applied_run_env:
+                 self._maybe_apply_cache_env(env, platform_config, project_dir_hint)
+
+             if "probe_paths" in spec:
+                 try:
+                     env["MLOPS_PROBE_PATHS"] = json.dumps(spec.get("probe_paths"))
+                 except Exception:
+                     pass
+
+             entry_to_run, import_file = self._resolve_reporting_entry_to_run(entry)
+             try:
+                 if import_file:
+                     env["MLOPS_CHART_IMPORT_FILES"] = import_file
+             except Exception:
+                 pass
+
+             try:
+                 self._ensure_repo_src_on_pythonpath(env)
+             except Exception as _path_e:
+                 logger.warning(f"Failed to set PYTHONPATH for dynamic chart '{name}': {_path_e}")
+
+             spec_args = list(spec.get("args") or [])
+             run_as_module = bool(default_entry and entry_to_run and entry_to_run.resolve() == default_entry.resolve())
+             if run_as_module:
+                 cmd = [reporting_python, "-u", "-m", "mlops.reporting.entrypoint"] + args + spec_args
+             else:
+                 cmd = [reporting_python, "-u", str(entry_to_run)] + args + spec_args
+
+             try:
+                 logger.info(f"Starting dynamic chart '{name}' in background -> {chart_out}")
+                 stdout_log = chart_out / "stdout.log"
+                 stderr_log = chart_out / "stderr.log"
+
+                 stdout_file = open(stdout_log, "a", buffering=1)
+                 stderr_file = open(stderr_log, "a", buffering=1)
+                 try:
+                     proc = subprocess.Popen(cmd, env=env, stdout=stdout_file, stderr=stderr_file, cwd=str(self._repo_root()))
+                 finally:
+                     # Close in parent; the child keeps its own fds.
+                     try:
+                         stdout_file.close()
+                     except Exception:
+                         pass
+                     try:
+                         stderr_file.close()
+                     except Exception:
+                         pass
+                 dynamic_processes.append(proc)
+                 logger.info(f"Dynamic chart '{name}' started with PID {proc.pid}, logs: stdout={stdout_log}, stderr={stderr_log}")
+             except Exception as _e:
+                 logger.warning(f"Failed to start dynamic chart '{name}': {_e}")
+
+         if dynamic_processes:
+             logger.info(f"Started {len(dynamic_processes)} dynamic chart(s) in background")
+
+         return dynamic_processes
+
+     def _start_dynamic_charts_distributed(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
+         """Start dynamic charts via the configured cluster provider.
+
+         Returns a list of provider-specific job identifiers (strings)."""
+         provider = self._get_cluster_provider_name(adapter)
+         if provider == "slurm":
+             return self._start_dynamic_charts_distributed_slurm(adapter, platform_config, run_id)
+         elif provider == "ansible":
+             # Run on the head node as a fallback; return labels as job ids.
+             procs = self._start_dynamic_charts_local(adapter, platform_config, run_id)
+             return [f"pid-{getattr(p, 'pid', 'unknown')}" for p in (procs or [])]
+         else:
+             logging.getLogger(__name__).warning(f"Unknown cluster provider '{provider}'. Falling back to local dynamic charts.")
+             procs = self._start_dynamic_charts_local(adapter, platform_config, run_id)
+             return [f"pid-{getattr(p, 'pid', 'unknown')}" for p in (procs or [])]
+
+     def _get_cluster_provider_name(self, adapter: Any) -> str:
+         """Determine the cluster provider from env or the project's cluster_config.yaml."""
+         try:
+             prov = os.environ.get("MLOPS_CLUSTER_PROVIDER")
+             if isinstance(prov, str) and prov.strip():
+                 return prov.strip().lower()
+         except Exception:
+             pass
+         # Try the project cluster_config.yaml
+         project_id = self._get_project_id_from_adapter(adapter)
+         project_dir_hint = self._get_project_dir_hint(project_id)
+         if project_dir_hint:
+             cfg_path = Path(project_dir_hint) / "configs" / "cluster_config.yaml"
+         else:
+             cfg_path = get_projects_root(self._repo_root()) / project_id / "configs" / "cluster_config.yaml"
+         try:
+             if cfg_path.exists():
+                 with open(cfg_path) as f:
+                     data = yaml.safe_load(f) or {}
+                 provider = data.get("provider")
+                 if isinstance(provider, str) and provider.strip():
+                     return provider.strip().lower()
+         except Exception:
+             pass
+         return "slurm"
+
+     def _start_dynamic_charts_distributed_slurm(self, adapter: Any, platform_config: Dict[str, Any], run_id: str) -> list:
+         """Start dynamic charts by submitting SLURM sbatch jobs."""
+         logger = logging.getLogger(__name__)
+
+         project_id = self._get_project_id_from_adapter(adapter)
+
+         reporting_cfg = self._get_reporting_cfg(platform_config)
+         entrypoint = self._get_reporting_entrypoint(reporting_cfg)
+         if not entrypoint:
+             return []
+
+         args = list(reporting_cfg.get("args") or [])
+         dynamic_charts = self._get_dynamic_chart_specs(reporting_cfg)
+         if not dynamic_charts:
+             return []
+
+         reporting_python = self._get_reporting_python_exec()
+         project_dir_hint = self._get_project_dir_hint(project_id)
+         entry = self._resolve_entrypoint_path(entrypoint, project_dir_hint=project_dir_hint)
+         if not entry:
+             logger.warning(f"Reporting entrypoint not found: {entrypoint}")
+             return []
+
+         # Output under the project artifacts directory
+         try:
+             if project_dir_hint:
+                 output_base = Path(project_dir_hint) / "artifacts" / "charts" / run_id
+             else:
+                 output_base = get_projects_root(self._repo_root()) / project_id / "artifacts" / "charts" / run_id
+         except Exception:
+             output_base = Path.cwd() / "projects" / project_id / "artifacts" / "charts" / run_id
+
+         default_entry = self._default_reporting_entrypoint_path()
+         entry_to_run, import_file = self._resolve_reporting_entry_to_run(entry)
+         run_as_module = bool(default_entry and entry_to_run and entry_to_run.resolve() == default_entry.resolve())
+
+         job_ids: list[str] = []
+         for spec in dynamic_charts:
+             name = str(spec.get("name") or "dynamic_chart").strip()
+             chart_out = output_base / name / time.strftime("%Y%m%d_%H%M%S")
+             try:
+                 chart_out.mkdir(parents=True, exist_ok=True)
+             except Exception:
+                 pass
+             try:
+                 self._dynamic_chart_outputs.append((name, chart_out))
+             except Exception:
+                 pass
+
+             env_vars = []
+             if project_id:
+                 env_vars.append(f"MLOPS_PROJECT_ID={project_id}")
+             try:
+                 env_vars.append(f"MLOPS_WORKSPACE_DIR={self._repo_root()}")
+             except Exception:
+                 pass
+             env_vars.append(f"MLOPS_OUTPUT_DIR={chart_out}")
+             env_vars.append(f"MLOPS_CHART_NAME={name}")
+             env_vars.append(f"MLOPS_RUN_ID={run_id}")
+             env_vars.append("MLOPS_CHART_TYPE=dynamic")
+
+             # Add import file if using user script
+             if import_file:
+                 env_vars.append(f"MLOPS_CHART_IMPORT_FILES={import_file}")
+
+             tmp_env = {}
+             self._maybe_apply_cache_env(tmp_env, platform_config, project_dir_hint)
+             for k, v in tmp_env.items():
+                 env_vars.append(f"{k}={v}")
+
+             if "probe_paths" in spec:
+                 try:
+                     probe_paths_json = json.dumps(spec.get("probe_paths")).replace('"', '\\"')
+                     env_vars.append(f'MLOPS_PROBE_PATHS="{probe_paths_json}"')
+                 except Exception:
+                     pass
+
+             try:
+                 src_root = infer_source_root()
+                 if src_root and (src_root / "src").exists():
+                     repo_src = str(src_root / "src")
+                     env_vars.append(f"PYTHONPATH={repo_src}:$PYTHONPATH")
+             except Exception:
+                 pass
+
+             chart_cmd_args = args + list(spec.get("args") or [])
+
+             if run_as_module:
+                 chart_cmd = f"{reporting_python} -u -m mlops.reporting.entrypoint {' '.join(chart_cmd_args)}"
+             else:
+                 chart_cmd = f"{reporting_python} -u {entry_to_run} {' '.join(chart_cmd_args)}"
+
+             sbatch_script = f"""#!/bin/bash
+ #SBATCH --job-name={name}
+ #SBATCH --output={chart_out}/slurm-%j.out
+ #SBATCH --error={chart_out}/slurm-%j.err
+ #SBATCH --time=01:00:00
+ #SBATCH --ntasks=1
+ #SBATCH --cpus-per-task=1
+ #SBATCH --mem=4G
+
+ # Export environment variables
+ {chr(10).join(f'export {var}' for var in env_vars)}
+
+ # Run the chart
+ {chart_cmd}
+ """
+
+             script_path = chart_out / f"submit_{name}.sh"
+             try:
+                 script_path.write_text(sbatch_script)
+                 script_path.chmod(0o755)
+             except Exception as e:
+                 logger.warning(f"Failed to write sbatch script for '{name}': {e}")
+                 continue
+
+             try:
+                 logger.info(f"Submitting dynamic chart '{name}' to cluster -> {chart_out}")
+                 result = subprocess.run(["sbatch", str(script_path)], capture_output=True, text=True, check=True)
+                 output = result.stdout.strip()
+                 if "Submitted batch job" in output:
+                     job_id = output.split()[-1]
+                     job_ids.append(job_id)
+                     logger.info(f"Dynamic chart '{name}' submitted as job {job_id}")
+                 else:
+                     logger.warning(f"Unexpected sbatch output for '{name}': {output}")
+             except Exception as e:
+                 logger.warning(f"Failed to submit dynamic chart '{name}': {e}")
+
+         if job_ids:
+             logger.info(f"Submitted {len(job_ids)} dynamic chart(s) to cluster: {', '.join(job_ids)}")
+         return job_ids
+
+     def _upload_dynamic_chart_artifacts_async(self, adapter: Any, run_id: str) -> None:
+         """Upload PNG artifacts from dynamic chart output directories asynchronously.
+
+         This runs in a background thread to avoid blocking the main execution flow.
+         """
+         import threading
+         logger = logging.getLogger(__name__)
+
+         def _upload_worker():
+             try:
+                 self._upload_dynamic_chart_artifacts(adapter, run_id)
+             except Exception as e:
+                 logger.warning(f"Async dynamic chart upload failed: {e}")
+
+         # Start the upload in a background thread.
+         upload_thread = threading.Thread(target=_upload_worker, daemon=True)
+         upload_thread.start()
+         logger.info("Dynamic chart artifact upload started in background")
+
+     def _upload_dynamic_chart_artifacts(self, adapter: Any, run_id: str) -> None:
+         """Upload PNG artifacts from dynamic chart output directories and record them in KV.
+
+         Best-effort: skips silently if directories are missing or the object store is unavailable.
+         """
+         logger = logging.getLogger(__name__)
+         try:
+             tracked: list = getattr(self, "_dynamic_chart_outputs", []) or []
+         except Exception:
+             tracked = []
+         if not tracked:
+             return
+         uploaded_any = False
+         for item in tracked:
+             try:
+                 name, chart_out = item
+             except Exception:
+                 continue
+             try:
+                 # Ensure the path is a Path
+                 chart_out = Path(chart_out)
+             except Exception:
+                 continue
+             if not chart_out.exists():
+                 # Skip missing dirs (e.g., remote-only paths)
+                 continue
+             try:
+                 self._upload_single_chart_artifacts(adapter, run_id, name, chart_out, chart_type="dynamic")
+                 uploaded_any = True
+             except Exception as _e:
+                 logger.warning(f"Dynamic chart upload failed for '{name}': {_e}")
+         if uploaded_any:
+             logger.info("Uploaded dynamic chart artifacts and recorded in KV store")
+
+     def _upload_single_chart_artifacts(self, adapter: Any, run_id: str, name: str, chart_out: Path, chart_type: str | None = None) -> None:
+         """Upload PNG artifacts for a single chart and record them in KV.
+
+         chart_type: optional "static" or "dynamic" for UI differentiation.
+         """
+         logger = logging.getLogger(__name__)
+         ssm = getattr(adapter, 'step_state_manager', None)
+         obj_store = getattr(ssm, 'object_store', None) if ssm else None
+         kv = getattr(ssm, 'kv_store', None) if ssm else None
+         try:
+             project_ns = os.environ.get('MLOPS_PROJECT_ID', '')
+         except Exception:
+             project_ns = ''
+         # If the bucket is known, build an absolute gs:// directory prefix for improved UX.
+         abs_charts_root = None
+         try:
+             if obj_store and hasattr(obj_store, '_bucket') and getattr(obj_store, '_bucket') is not None:
+                 bname = getattr(getattr(obj_store, '_bucket'), 'name', None)
+                 if bname:
+                     abs_charts_root = f"gs://{bname}/projects/{project_ns}/charts/{run_id}"
+         except Exception:
+             abs_charts_root = None
+         import time as _time
+         artifacts: list[dict] = []
+         # Capture PNGs recursively to support nested structures under the chart output dir.
+         for p in chart_out.rglob("*.png"):
+             obj_path = None
+             # Always capture the local cache path so the UI can fetch from the server if the object store is unavailable.
+             try:
+                 local_path = str(p.resolve())
+             except Exception:
+                 local_path = str(p)
+             if obj_store:
+                 try:
+                     base = f"projects/{project_ns}/charts/{run_id}/{name}"
+                     if abs_charts_root:
+                         base = f"{abs_charts_root}/{name}"
+                     remote = obj_store.build_uri(base, p.name)
+                     with open(p, 'rb') as f:
+                         obj_store.put_bytes(remote, f.read(), content_type="image/png")
+                     obj_path = remote
+                 except Exception as _ue:
+                     logger.warning(f"Upload failed for chart '{name}' file {p.name}: {_ue}")
+             if not obj_path:
+                 # Fall back to the local cache path if there is no remote object path.
+                 obj_path = local_path
+             try:
+                 artifacts.append({
+                     "title": p.name,
+                     "object_path": obj_path,
+                     "cache_path": local_path,
+                     "mime_type": "image/png",
+                     "size_bytes": p.stat().st_size,
+                     "created_at": _time.time(),
+                     "chart_type": (chart_type or "static"),
+                 })
+             except Exception:
+                 pass
+         # Record artifacts in the KV store for UI listing.
+         try:
+             if kv and hasattr(kv, 'record_run_chart_artifacts'):
+                 kv.record_run_chart_artifacts(run_id, name, artifacts)
+         except Exception:
+             pass
+
+
+     # -------------------- Cloud bootstrap helpers --------------------
+
+     def _configure_logging(self, config: Dict[str, Any], project_path: Path) -> None:
+         """Configure the Python logging system based on the config.
+
+         Honors the MLOPS_RUN_LOG_FILE env var to force a unique per-run log file.
+         """
+         # Prefer an explicit per-run log path if it is timestamped; otherwise, create a timestamped file.
+         env_log = os.environ.get("MLOPS_RUN_LOG_FILE")
+         chosen_log_file: str
+         if env_log:
+             try:
+                 name = Path(env_log).name
+                 pattern = rf"^{re.escape(project_path.name)}_\d{{8}}_\d{{6}}\.log$"
+                 if re.match(pattern, name):
+                     chosen_log_file = env_log
+                 else:
+                     raise ValueError("Non-timestamped log path provided; overriding with timestamped path")
+             except Exception:
+                 ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+                 chosen_log_file = str(project_path / "logs" / f"{project_path.name}_{ts}.log")
+                 self._set_env_var("MLOPS_RUN_LOG_FILE", chosen_log_file)
+         else:
+             ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+             chosen_log_file = str(project_path / "logs" / f"{project_path.name}_{ts}.log")
+             self._set_env_var("MLOPS_RUN_LOG_FILE", chosen_log_file)
+         log_path = Path(chosen_log_file)
+         log_path.parent.mkdir(parents=True, exist_ok=True)
+
+         handlers = [logging.FileHandler(str(log_path), encoding="utf-8")]
+         logging.basicConfig(
+             level=logging.INFO,
+             format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+             handlers=handlers,
+             force=True
+         )
+
+         # Redirect stdout/stderr prints to the logging system so nothing goes to the terminal.
+         class _StreamToLogger:
+             def __init__(self, logger: logging.Logger, level: int):
+                 self.logger = logger
+                 self.level = level
+                 self._buffer = ""
+
+             def write(self, message: str) -> None:
+                 if not message:
+                     return
+                 self._buffer += message
+                 while "\n" in self._buffer:
+                     line, self._buffer = self._buffer.split("\n", 1)
+                     if line.strip():
+                         self.logger.log(self.level, line)
+
+             def flush(self) -> None:
+                 if self._buffer.strip():
+                     self.logger.log(self.level, self._buffer.strip())
+                 self._buffer = ""
+
+         root_logger = logging.getLogger()
+         sys.stdout = _StreamToLogger(root_logger, logging.INFO)
+         sys.stderr = _StreamToLogger(root_logger, logging.ERROR)
+
+     def run_pipeline_for_project(self, project_id: str, config_path: str) -> Dict[str, Any]:
+         """
+         Run the ML pipeline for a specific project.
+
+         Args:
+             project_id: The project identifier
+             config_path: Path to the project's configuration file
+
+         Returns:
+             Pipeline execution results
+         """
+         project_manager = ProjectManager()
+
+         if not project_manager.project_exists(project_id):
+             raise ValueError(f"Project '{project_id}' does not exist")
+
+         project_path = project_manager.get_project_path(project_id)
+
+         with open(config_path) as f:
+             platform_config = yaml.safe_load(f)
+
+         self._update_config_for_project(platform_config, project_path, project_id)
+
+         # Export the project id so reporting and upload paths use the correct namespace.
+         self._set_env_var("MLOPS_PROJECT_ID", str(project_id))
+
+         self._configure_logging(platform_config, project_path)
+         logger = logging.getLogger(__name__)
+
+         tracker = self._initialize_tracker(platform_config)
+         repro_manager = ReproducibilityManager(config_path, tracker_instance=tracker, project_path=project_path)
+
+         repro_manager.ensure_reproducibility_setup()
+         # Ensure both runtime and reporting environments are initialized so their interpreters are available.
+         try:
+             repro_manager.setup_environment()
+         except Exception as _env_e:
+             logger.warning(f"Environment setup skipped or failed: {_env_e}")
+
+         # Export the project runtime and reporting interpreters so auto-installs go into the venvs.
+         try:
+             if getattr(repro_manager, 'python_interpreter', None):
+                 self._set_env_var("MLOPS_RUNTIME_PYTHON", str(repro_manager.python_interpreter))
+         except Exception:
+             pass
+         try:
+             rep_py = getattr(repro_manager, "reporting_python_interpreter", None)
+             if rep_py:
+                 self._set_env_var("MLOPS_REPORTING_PYTHON", str(rep_py))
+         except Exception:
+             pass
+
+         # Export reporting config for workers (charts metadata and entrypoints).
+         try:
+             rep_cfg_in = (platform_config or {}).get('reporting', {}) or {}
+             rep_cfg: dict = rep_cfg_in if isinstance(rep_cfg_in, dict) else {}
+             # Include interpreter hints so chart runners can reliably select the reporting env
+             # even when env propagation is imperfect (e.g., distributed workers).
+             try:
+                 if getattr(repro_manager, "python_interpreter", None):
+                     rep_cfg.setdefault("runtime_python", str(repro_manager.python_interpreter))
+                 rep_py = getattr(repro_manager, "reporting_python_interpreter", None)
+                 if rep_py:
+                     rep_cfg.setdefault("reporting_python", str(rep_py))
+             except Exception:
+                 pass
+             self._set_env_var("MLOPS_REPORTING_CONFIG", json.dumps(rep_cfg))
+         except Exception:
+             pass
+
+         model_section = platform_config.get("model", {})
+         try:
+             repro_manager.apply_cloud_env_from_config(model_section)
+             repro_manager.ensure_cloud_dependencies(model_section)
+         except Exception as _cloud_e:
+             logger.warning(f"Cloud bootstrap skipped or failed: {_cloud_e}")
+
+
+         run_id = self._generate_run_id(platform_config, project_id)
+         run_metadata = self._prepare_run_metadata(platform_config, run_id)
+
+         # Keep this exact substring ("with run_id:") stable; the cluster controller parses it.
+         print(f"Executing project '{project_id}' with run_id: {run_id}")
+
+         run_started = False
+         final_status = "FINISHED"
+
+         try:
+             tracker.start_run(**run_metadata)
+             run_started = True
+             model_config = AdapterConfig(**platform_config["model"])
+
+             # Build a typed run context so adapters/executors can avoid relying on implicit env vars.
+             try:
+                 from mlops.runtime.context import RunContext
+                 workspace_root = get_workspace_root()
+                 cache_cfg = ((platform_config.get("model") or {}).get("parameters") or {}).get("cache") or {}
+                 cache_cfg = cache_cfg if isinstance(cache_cfg, dict) else {}
+                 backend_cfg = cache_cfg.get("backend") or {}
+                 backend_cfg = backend_cfg if isinstance(backend_cfg, dict) else {}
+                 reporting_cfg = (platform_config.get("reporting") or {}) if isinstance(platform_config.get("reporting"), dict) else {}
+                 run_context = RunContext(
+                     workspace_root=workspace_root,
+                     project_id=str(project_id),
+                     project_root=project_path,
+                     run_id=str(run_id),
+                     runtime_python=getattr(repro_manager, "python_interpreter", None),
+                     reporting_python=getattr(repro_manager, "reporting_python_interpreter", None),
+                     cache_backend=dict(backend_cfg),
+                     cache_config=dict(cache_cfg),
+                     reporting_config=dict(reporting_cfg),
+                 )
+             except Exception:
+                 run_context = None
+
+             adapter = self.adapter_manager.create_adapter(
+                 platform_config["model"]["framework"],
+                 model_config,
+                 python_interpreter=repro_manager.python_interpreter,
+                 environment_name=repro_manager.environment_name,
+                 project_path=project_manager.get_project_path(project_id),
+                 run_context=run_context,
+             )
+
+             if adapter is None:
+                 raise ValueError(f"Could not create adapter for framework: {platform_config['model']['framework']}")
+
+             adapter.initialize()
+             try:
+                 if hasattr(adapter, "set_tracker"):
+                     adapter.set_tracker(tracker)
+             except Exception:
+                 pass
+
+             pipeline_results = self._execute_pipeline(adapter, platform_config, run_id, tracker)
+
+             saved_artifact_paths = repro_manager.save_run_artifacts_locally(run_id, adapter)
+             pipeline_results["artifact_paths"] = saved_artifact_paths
+
+             config_hash = self._compute_config_hash(platform_config)
+             project_manager.add_run_to_project(project_id, run_id, config_hash)
+
+             print(f"Project '{project_id}' pipeline execution completed successfully!")
+             return pipeline_results
+
+         except Exception as e:
+             final_status = "FAILED"
+             print(f"Pipeline execution failed: {e}")
+             raise
+         finally:
+             if run_started:
+                 try:
+                     if hasattr(tracker, "run_active"):
+                         if getattr(tracker, "run_active"):
+                             tracker.end_run(status=final_status)
+                     else:
+                         tracker.end_run(status=final_status)
+                 except Exception:
+                     pass
+
+     def _update_config_for_project(self, config: Dict[str, Any], project_path: Path, project_id: str) -> None:
+         """Update configuration paths to be project-specific."""
+         repro = config.get("reproducibility")
+         if isinstance(repro, dict):
+             artifacts_config = repro.get("artifacts")
+             if isinstance(artifacts_config, dict):
+                 model_cfg = artifacts_config.get("model")
+                 if isinstance(model_cfg, dict):
+                     model_cfg["path"] = str(project_path / "artifacts" / "models")
+                 data_cfg = artifacts_config.get("data")
+                 if isinstance(data_cfg, dict):
+                     data_cfg["path"] = str(project_path / "artifacts" / "data")
+
+             tracking_config = repro.get("experiment_tracking")
+             if isinstance(tracking_config, dict):
+                 params = tracking_config.get("parameters")
+                 if isinstance(params, dict):
+                     tracking_uri = params.get("tracking_uri")
+                     if isinstance(tracking_uri, str) and "sqlite" in tracking_uri:
+                         params["tracking_uri"] = f"sqlite:///{project_path}/artifacts/experiments.db"
+
+
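run_pipeline_for_project is the module's public entry point. A minimal usage sketch follows; the project id and config path are hypothetical, and in the released package the call is presumably wired up through the console script declared in entry_points.txt (see mlops/main.py):

    from mlops.platform import MLPlatform

    platform = MLPlatform()
    results = platform.run_pipeline_for_project(
        "demo-project",                                        # hypothetical project id
        "projects/demo-project/configs/project_config.yaml",   # hypothetical config path
    )
    print(results["run_id"])          # generated by _generate_run_id
    print(results["artifact_paths"])  # added after _execute_pipeline returns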