expops-0.1.3-py3-none-any.whl

Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
mlops/cluster/providers.py ADDED
@@ -0,0 +1,258 @@
+ from __future__ import annotations
+
+ from typing import Optional, Tuple, Any, Dict
+ import logging
+ import sys
+ import os
+ from pathlib import Path
+
+ from mlops.core.workspace import get_workspace_root, infer_source_root
+
+
+ class ClusterProvider:
+     """Abstract interface for provisioning a Dask distributed cluster.
+
+     Implementations should provision a scheduler and workers on the target
+     infrastructure and return a connected dask.distributed.Client (or None)
+     and the scheduler address string.
+     """
+
+     def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+         self.logger = logger or logging.getLogger(self.__class__.__name__)
+
+     def start(self, num_workers: int, options: Optional[Dict[str, Any]] = None) -> Tuple[Optional[Any], Optional[str]]:
+         raise NotImplementedError
+
+     def stop(self) -> None:
+         raise NotImplementedError
+
+
+ class SlurmClusterProvider(ClusterProvider):
+     """Provision a Dask cluster on SLURM using dask-jobqueue's SLURMCluster.
+
+     Note: This provider requires the optional dependency 'dask-jobqueue'.
+     """
+
+     def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+         super().__init__(logger)
+         self._cluster = None
+         self._client = None
+
+     def start(self, num_workers: int, options: Optional[Dict[str, Any]] = None) -> Tuple[Optional[Any], Optional[str]]:
+         options = options or {}
+         try:
+             # Prefer importing Client from the 'distributed' package to avoid
+             # reliance on the 'dask' namespace being present. Fall back to
+             # 'dask.distributed' for older setups.
+             try:
+                 from distributed import Client
+             except Exception:
+                 from dask.distributed import Client
+             from dask_jobqueue import SLURMCluster
+         except Exception as e:
+             # If the dask-jobqueue or dask import fails, fall back to a local
+             # in-process distributed cluster so execution can proceed.
+             self.logger.error(f"SLURM provider unavailable (missing deps?): {e}")
+             try:
+                 try:
+                     from distributed import Client, LocalCluster  # type: ignore
+                 except Exception:
+                     from dask.distributed import Client, LocalCluster  # type: ignore
+                 self._cluster = LocalCluster(n_workers=max(1, int(options.get('worker_processes', 1) * num_workers)),
+                                              threads_per_worker=int(options.get('worker_cores', 1)))
+                 self._client = Client(self._cluster)
+                 addr = getattr(self._cluster, 'scheduler_address', None) or getattr(self._client.scheduler, 'address', None)
+                 self.logger.warning(f"Falling back to LocalCluster at {addr} (threads_per_worker={int(options.get('worker_cores', 1))}, n_workers={max(1, int(options.get('worker_processes', 1) * num_workers))})")
+                 return self._client, addr
+             except Exception as e2:
+                 self.logger.error(f"Failed to start LocalCluster fallback: {e2}")
+                 return None, None
+
+         worker_cores = int(options.get('worker_cores', 1))
+         worker_memory = options.get('worker_memory', '2GB')
+         worker_processes = int(options.get('worker_processes', 1))
+         queue = options.get('queue')
+         walltime = options.get('walltime', '00:30:00')
+         # Optional: additional sbatch directives passed through to SLURMCluster.
+         # Accept both 'job_extra' and the legacy 'job_extra_directives'.
+         job_extra = options.get('job_extra') or options.get('job_extra_directives') or []
+         if isinstance(job_extra, str):
+             job_extra = [job_extra]
+
+         # Convenience option: when True, ensure each worker lands on a distinct node
+         # by requesting node-level exclusivity for each worker job.
+         spread_workers = bool(options.get('spread_workers_across_nodes', False))
+         if spread_workers and not any(str(opt).startswith('--exclusive') for opt in job_extra):
+             job_extra.append('--exclusive')
+
+         workspace_root = get_workspace_root()
+         source_root = infer_source_root()
+
+         # Source-checkout support: allow workers to import from <repo>/src on shared filesystems.
+         # For installed packages this is typically unnecessary and <workspace>/src will not exist.
+         src_dir = None
+         try:
+             if source_root and (source_root / "src").exists():
+                 src_dir = (source_root / "src")
+             elif (workspace_root / "src").exists():
+                 src_dir = (workspace_root / "src")
+         except Exception:
+             src_dir = None
+
+         # Allow users to pass a custom prologue; map the legacy env_extra to job_script_prologue to avoid warnings.
+         job_script_prologue = []
+         if options.get('job_script_prologue'):
+             pro = options.get('job_script_prologue')
+             job_script_prologue = pro if isinstance(pro, list) else [str(pro)]
+         elif options.get('env_extra'):
+             pro = options.get('env_extra')
+             job_script_prologue = pro if isinstance(pro, list) else [str(pro)]
+
+         # Ensure workers use the same Python interpreter and can import our code.
+         # Also force consistent comm compression across client/scheduler/workers
+         # to avoid codec mismatches that can break task-graph deserialization.
+         requested_compression = (
+             options.get('comm_compression')
+             or options.get('compression')
+             or os.environ.get('DASK_DISTRIBUTED__COMM__COMPRESSION')
+             or 'zlib'
+         )
+         compression_value = str(requested_compression)
+         os.environ.setdefault('DASK_DISTRIBUTED__COMM__COMPRESSION', compression_value)
+         job_script_prologue = job_script_prologue + [
+             # Always export the workspace so workers can find projects/ regardless of CWD.
+             f'export MLOPS_WORKSPACE_DIR="{workspace_root}"',
+             f'export DASK_DISTRIBUTED__COMM__COMPRESSION="{compression_value}"',
+         ]
+         if src_dir:
+             job_script_prologue.append(f'export PYTHONPATH="{src_dir}:${{PYTHONPATH:-}}"')
+
+         def _build_kwargs_base() -> Dict[str, Any]:
+             base = dict(
+                 cores=worker_cores,
+                 memory=worker_memory,
+                 processes=worker_processes,
+                 queue=queue,
+                 walltime=walltime,
+                 python=sys.executable,
+                 job_script_prologue=job_script_prologue,
+             )
+             # Allow arbitrary SLURMCluster kwargs via 'cluster_kwargs'.
+             base.update(options.get('cluster_kwargs') or {})
+             return base
+
+         def _create_cluster(extra_directives: list[str]):
+             # Prefer the new parameter name to avoid a FutureWarning; fall back if unsupported.
+             base = _build_kwargs_base()
+             try:
+                 # Newer dask-jobqueue
+                 base_new = dict(base)
+                 base_new['job_extra_directives'] = extra_directives
+                 return SLURMCluster(**base_new)
+             except TypeError:
+                 # Older dask-jobqueue
+                 base_old = dict(base)
+                 base_old['job_extra'] = extra_directives
+                 return SLURMCluster(**base_old)
+
+         # First attempt with the requested directives
+         self._cluster = _create_cluster(job_extra)
+         self._cluster.scale(num_workers)
+         self._client = Client(self._cluster)
+         address: Optional[str]
+         try:
+             address = self._client.scheduler.address
+         except Exception:
+             address = None
+
+         # Wait briefly for at least one worker; if none arrived and we added exclusivity, retry without it.
+         try:
+             if num_workers > 0:
+                 # 60s should be enough for sbatch to accept or reject worker jobs
+                 self._client.wait_for_workers(min(1, num_workers), timeout=60)
+         except Exception:
+             # If spreading was requested, remove exclusivity and retry once
+             if spread_workers and any(str(opt).startswith('--exclusive') for opt in job_extra):
+                 self.logger.warning("SLURM exclusive allocation not permitted or workers failed to start; retrying without --exclusive")
+                 try:
+                     # Tear down the previous cluster before retrying
+                     self._client.close()
+                 except Exception:
+                     pass
+                 try:
+                     self._cluster.close()
+                 except Exception:
+                     pass
+                 # Rebuild without exclusivity
+                 filtered = [opt for opt in job_extra if not str(opt).startswith('--exclusive')]
+                 self._cluster = _create_cluster(filtered)
+                 self._cluster.scale(num_workers)
+                 self._client = Client(self._cluster)
+                 try:
+                     address = self._client.scheduler.address
+                 except Exception:
+                     address = None
+                 # Don't raise if workers still take long; proceed and let the Dask run degrade gracefully.
+             else:
+                 self.logger.warning("Workers failed to start within the timeout; proceeding anyway")
+
+         self.logger.info(
+             f"Started SLURMCluster: workers={num_workers}, cores/worker={worker_cores}, mem/worker={worker_memory}"
+         )
+         return self._client, address
+
+     def stop(self) -> None:
+         try:
+             if self._client is not None:
+                 self._client.close()
+         finally:
+             self._client = None
+         if self._cluster is not None:
+             try:
+                 self._cluster.close()
+             finally:
+                 self._cluster = None
+
+
+
+ class AnsibleClusterProvider(ClusterProvider):
+     """Provision a Dask cluster on a set of hosts managed via Ansible or SSH.
+
+     This is a minimal stub that expects an address to be provided via options
+     or environment variables and does not itself run Ansible. In a full
+     implementation, this class would orchestrate scheduler/worker processes
+     across inventory hosts and return a connected Client.
+     """
+
+     def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+         super().__init__(logger)
+         self._client = None
+
+     def start(self, num_workers: int, options: Optional[Dict[str, Any]] = None) -> Tuple[Optional[Any], Optional[str]]:
+         options = options or {}
+         scheduler_address = options.get('scheduler_address')
+         if not scheduler_address:
+             # Try the environment variable
+             import os
+             scheduler_address = os.environ.get('DASK_SCHEDULER_ADDRESS')
+         if not scheduler_address:
+             self.logger.error("AnsibleClusterProvider requires 'scheduler_address' in options or the DASK_SCHEDULER_ADDRESS env var")
+             return None, None
+         try:
+             try:
+                 from distributed import Client
+             except Exception:
+                 from dask.distributed import Client
+             self._client = Client(scheduler_address)
+             self.logger.info(f"Connected to existing Dask scheduler at {scheduler_address}")
+             return self._client, scheduler_address
+         except Exception as e:
+             self.logger.error(f"Failed to connect to scheduler at {scheduler_address}: {e}")
+             return None, None
+
+     def stop(self) -> None:
+         if self._client is not None:
+             try:
+                 self._client.close()
+             finally:
+                 self._client = None
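
A minimal usage sketch (not part of the package) showing how a caller might drive these providers. The option keys mirror the ones read in providers.py above; the concrete values (partition name, account directive, worker sizing) are illustrative assumptions:

import logging

from mlops.cluster.providers import SlurmClusterProvider

logging.basicConfig(level=logging.INFO)

provider = SlurmClusterProvider()
client, scheduler_address = provider.start(
    num_workers=4,
    options={
        "worker_cores": 4,                     # threads per worker job
        "worker_memory": "8GB",                # memory per worker job
        "worker_processes": 1,                 # Dask worker processes per job
        "queue": "compute",                    # SLURM partition (assumed name)
        "walltime": "01:00:00",
        "spread_workers_across_nodes": True,   # adds --exclusive to each worker job
        "job_extra": ["--account=myproject"],  # extra sbatch directives (assumed value)
    },
)
try:
    if client is not None:
        print("Scheduler:", scheduler_address)  # submit Dask work via client here
finally:
    provider.stop()

AnsibleClusterProvider follows the same start/stop contract but only attaches to an existing scheduler, taken from options['scheduler_address'] or the DASK_SCHEDULER_ADDRESS environment variable.
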
mlops/core/__init__.py ADDED
@@ -0,0 +1,95 @@
+ """
+ MLOps Core Module (lazy-loading)
+
+ Provides the core components for the NetworkX-based pipeline execution system.
+ Heavy submodules are imported lazily on attribute access to minimize required
+ runtime dependencies for lightweight utilities (e.g., pipeline_utils).
+ """
+
+ from typing import Any
+ import importlib
+
+ __all__ = [
+     # step_system exports
+     "step",
+     "process",
+     "StepContext",
+     "StepContextFactory",
+     "StepDefinition",
+     "StepRegistry",
+     "ProcessDefinition",
+     "ProcessRegistry",
+     "get_step_registry",
+     "get_process_registry",
+     "get_current_context",
+     "set_current_context",
+     "get_context_factory",
+     "set_current_process_context",
+     "get_current_process_context",
+     "get_parameter_resolver",
+     "set_state_manager",
+     "get_state_manager",
+     "log_metric",
+     "SerializableData",
+     "ModelData",
+     # custom model
+     "MLOpsCustomModelBase",
+     # graph types + parser
+     "NetworkXGraphConfig",
+     "ProcessConfig",
+     "StepConfig",
+     "ExecutionResult",
+     "NodeType",
+     "NetworkXPipelineParser",
+     "parse_networkx_pipeline_from_config",
+     # state manager
+     "StepStateManager",
+ ]
+
+ _lazy_attr_to_module = {
+     "step": ("mlops.core.step_system", "step"),
+     "process": ("mlops.core.step_system", "process"),
+     "StepContext": ("mlops.core.step_system", "StepContext"),
+     "StepContextFactory": ("mlops.core.step_system", "StepContextFactory"),
+     "StepDefinition": ("mlops.core.step_system", "StepDefinition"),
+     "StepRegistry": ("mlops.core.step_system", "StepRegistry"),
+     "ProcessDefinition": ("mlops.core.step_system", "ProcessDefinition"),
+     "ProcessRegistry": ("mlops.core.step_system", "ProcessRegistry"),
+     "get_step_registry": ("mlops.core.step_system", "get_step_registry"),
+     "get_process_registry": ("mlops.core.step_system", "get_process_registry"),
+     "get_current_context": ("mlops.core.step_system", "get_current_context"),
+     "set_current_context": ("mlops.core.step_system", "set_current_context"),
+     "get_context_factory": ("mlops.core.step_system", "get_context_factory"),
+     "set_current_process_context": ("mlops.core.step_system", "set_current_process_context"),
+     "get_current_process_context": ("mlops.core.step_system", "get_current_process_context"),
+     "get_parameter_resolver": ("mlops.core.step_system", "get_parameter_resolver"),
+     "set_state_manager": ("mlops.core.step_system", "set_state_manager"),
+     "get_state_manager": ("mlops.core.step_system", "get_state_manager"),
+     "log_metric": ("mlops.core.step_system", "log_metric"),
+     "SerializableData": ("mlops.core.step_system", "SerializableData"),
+     "ModelData": ("mlops.core.step_system", "ModelData"),
+     # custom model base
+     "MLOpsCustomModelBase": ("mlops.core.custom_model_base", "MLOpsCustomModelBase"),
+     # graph types + parser
+     "NetworkXGraphConfig": ("mlops.core.graph_types", "NetworkXGraphConfig"),
+     "ProcessConfig": ("mlops.core.graph_types", "ProcessConfig"),
+     "StepConfig": ("mlops.core.graph_types", "StepConfig"),
+     "ExecutionResult": ("mlops.core.graph_types", "ExecutionResult"),
+     "NodeType": ("mlops.core.graph_types", "NodeType"),
+     "NetworkXPipelineParser": ("mlops.core.networkx_parser", "NetworkXPipelineParser"),
+     "parse_networkx_pipeline_from_config": ("mlops.core.networkx_parser", "parse_networkx_pipeline_from_config"),
+     # state manager
+     "StepStateManager": ("mlops.core.step_state_manager", "StepStateManager"),
+ }
+
+
+ def __getattr__(name: str) -> Any:
+     if name in _lazy_attr_to_module:
+         module_name, attr_name = _lazy_attr_to_module[name]
+         module = importlib.import_module(module_name)
+         return getattr(module, attr_name)
+     raise AttributeError(f"module 'mlops.core' has no attribute '{name}'")
+
+
+ def __dir__() -> list[str]:
+     return sorted(list(globals().keys()) + __all__)
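
The lazy loading above relies on module-level __getattr__ (PEP 562): names in __all__ are resolved to their defining submodule only on first attribute access. A small illustrative sketch, assuming the package and its heavier dependencies are importable:

import sys

import mlops.core

# Importing the package alone pulls in only 'typing' and 'importlib'.
assert "mlops.core.step_system" not in sys.modules

# First attribute access triggers __getattr__, which imports the submodule on demand.
step = mlops.core.step
assert "mlops.core.step_system" in sys.modules

# Unknown names still raise AttributeError, as for a regular module.
try:
    mlops.core.does_not_exist
except AttributeError as exc:
    print(exc)
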
mlops/core/custom_model_base.py ADDED
@@ -0,0 +1,38 @@
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+
+ class MLOpsCustomModelBase:
+     """Lightweight base class for user-defined models.
+
+     This class is intentionally minimal: it stores hyperparameters and (when not
+     explicitly provided) tries to resolve process-scoped hyperparameters from the
+     active `StepContext`.
+     """
+
+     def __init__(self, hyperparameters: Optional[dict[str, Any]] = None) -> None:
+         """Initialize with hyperparameters.
+
+         If not provided, automatically resolve merged hyperparameters from the
+         active step context for the current process (global overrides -> process overrides).
+         """
+         if hyperparameters and isinstance(hyperparameters, dict):
+             self.hyperparameters = hyperparameters
+             return
+         try:
+             from .step_system import get_current_context
+             ctx = get_current_context()
+             if ctx and hasattr(ctx, 'get_hyperparameters'):
+                 proc = getattr(ctx, 'current_process', None)
+                 resolved = ctx.get_hyperparameters(proc)
+                 self.hyperparameters = resolved if isinstance(resolved, dict) else {}
+             else:
+                 self.hyperparameters = {}
+         except Exception:
+             self.hyperparameters = {}
+
+     def get_step_registry(self) -> Any:
+         """Get the step registry containing all @step decorated functions."""
+         from .step_system import get_step_registry
+         return get_step_registry()
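
A hypothetical subclass sketch showing how user code would build on MLOpsCustomModelBase. Outside a running pipeline there is no active StepContext, so hyperparameters are passed explicitly; inside a @process/@step execution they would be resolved from the context automatically. MyModel and its fields are illustrative, not part of the package:

from typing import Any

from mlops.core.custom_model_base import MLOpsCustomModelBase


class MyModel(MLOpsCustomModelBase):
    def fit(self, X: Any, y: Any) -> "MyModel":
        # Read tuning values from the resolved hyperparameters, with defaults.
        self.learning_rate = float(self.hyperparameters.get("learning_rate", 0.01))
        self.n_estimators = int(self.hyperparameters.get("n_estimators", 100))
        # ... fit an estimator here ...
        return self


model = MyModel(hyperparameters={"learning_rate": 0.1, "n_estimators": 200})
print(model.hyperparameters)  # {'learning_rate': 0.1, 'n_estimators': 200}
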