expops-0.1.3-py3-none-any.whl

Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
mlops/cluster/providers.py ADDED
@@ -0,0 +1,258 @@
+ from __future__ import annotations
+
+ from typing import Optional, Tuple, Any, Dict
+ import logging
+ import sys
+ import os
+ from pathlib import Path
+
+ from mlops.core.workspace import get_workspace_root, infer_source_root
+
+
+ class ClusterProvider:
+     """Abstract interface for provisioning a Dask distributed cluster.
+
+     Implementations should provision a scheduler and workers on the target
+     infrastructure and return a connected dask.distributed.Client (or None)
+     and the scheduler address string.
+     """
+
+     def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+         self.logger = logger or logging.getLogger(self.__class__.__name__)
+
+     def start(self, num_workers: int, options: Optional[Dict[str, Any]] = None) -> Tuple[Optional[Any], Optional[str]]:
+         raise NotImplementedError
+
+     def stop(self) -> None:
+         raise NotImplementedError
+
+
+ class SlurmClusterProvider(ClusterProvider):
+     """Provision a Dask cluster on SLURM using dask-jobqueue's SLURMCluster.
+
+     Note: This provider requires the optional dependency 'dask-jobqueue'.
+     """
+
+     def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+         super().__init__(logger)
+         self._cluster = None
+         self._client = None
+
+     def start(self, num_workers: int, options: Optional[Dict[str, Any]] = None) -> Tuple[Optional[Any], Optional[str]]:
+         options = options or {}
+         try:
+             # Prefer importing Client from the 'distributed' package to avoid
+             # reliance on the 'dask' namespace being present. Fall back to
+             # 'dask.distributed' for older setups.
+             try:
+                 from distributed import Client
+             except Exception:
+                 from dask.distributed import Client
+             from dask_jobqueue import SLURMCluster
+         except Exception as e:
+             # If the dask-jobqueue or dask import fails, fall back to a local
+             # in-process distributed cluster so execution can proceed.
+             self.logger.error(f"SLURM provider unavailable (missing deps?): {e}")
+             try:
+                 try:
+                     from distributed import Client, LocalCluster  # type: ignore
+                 except Exception:
+                     from dask.distributed import Client, LocalCluster  # type: ignore
+                 self._cluster = LocalCluster(n_workers=max(1, int(options.get('worker_processes', 1) * num_workers)),
+                                              threads_per_worker=int(options.get('worker_cores', 1)))
+                 self._client = Client(self._cluster)
+                 addr = getattr(self._cluster, 'scheduler_address', None) or getattr(self._client.scheduler, 'address', None)
+                 self.logger.warning(f"Falling back to LocalCluster at {addr} (threads_per_worker={int(options.get('worker_cores', 1))}, n_workers={max(1, int(options.get('worker_processes', 1) * num_workers))})")
+                 return self._client, addr
+             except Exception as e2:
+                 self.logger.error(f"Failed to start LocalCluster fallback: {e2}")
+                 return None, None
+
+         worker_cores = int(options.get('worker_cores', 1))
+         worker_memory = options.get('worker_memory', '2GB')
+         worker_processes = int(options.get('worker_processes', 1))
+         queue = options.get('queue')
+         walltime = options.get('walltime', '00:30:00')
+         # Optional: additional sbatch directives passed through to SLURMCluster.
+         # Accept both 'job_extra' and the legacy 'job_extra_directives'.
+         job_extra = options.get('job_extra') or options.get('job_extra_directives') or []
+         if isinstance(job_extra, str):
+             job_extra = [job_extra]
+
+         # Convenience option: when True, ensure each worker lands on a distinct node
+         # by requesting node-level exclusivity for each worker job.
+         spread_workers = bool(options.get('spread_workers_across_nodes', False))
+         if spread_workers and not any(str(opt).startswith('--exclusive') for opt in job_extra):
+             job_extra.append('--exclusive')
+
+         workspace_root = get_workspace_root()
+         source_root = infer_source_root()
+
+         # Source-checkout support: allow workers to import from <repo>/src on shared filesystems.
+         # For installed packages this is typically unnecessary and <workspace>/src will not exist.
+         src_dir = None
+         try:
+             if source_root and (source_root / "src").exists():
+                 src_dir = (source_root / "src")
+             elif (workspace_root / "src").exists():
+                 src_dir = (workspace_root / "src")
+         except Exception:
+             src_dir = None
+
+         # Allow users to pass a custom prologue; map the legacy env_extra to job_script_prologue to avoid warnings.
+         job_script_prologue = []
+         if options.get('job_script_prologue'):
+             pro = options.get('job_script_prologue')
+             job_script_prologue = pro if isinstance(pro, list) else [str(pro)]
+         elif options.get('env_extra'):
+             pro = options.get('env_extra')
+             job_script_prologue = pro if isinstance(pro, list) else [str(pro)]
+
+         # Ensure workers use the same Python interpreter and can import our code.
+         # Also force consistent comm compression across client/scheduler/workers
+         # to avoid codec mismatches that can break task-graph deserialization.
+         requested_compression = (
+             options.get('comm_compression')
+             or options.get('compression')
+             or os.environ.get('DASK_DISTRIBUTED__COMM__COMPRESSION')
+             or 'zlib'
+         )
+         compression_value = str(requested_compression)
+         os.environ.setdefault('DASK_DISTRIBUTED__COMM__COMPRESSION', compression_value)
+         job_script_prologue = job_script_prologue + [
+             # Always export the workspace so workers can find projects/ regardless of CWD.
+             f'export MLOPS_WORKSPACE_DIR="{workspace_root}"',
+             f'export DASK_DISTRIBUTED__COMM__COMPRESSION="{compression_value}"',
+         ]
+         if src_dir:
+             job_script_prologue.append(f'export PYTHONPATH="{src_dir}:${{PYTHONPATH:-}}"')
+
+         def _build_kwargs_base() -> Dict[str, Any]:
+             base = dict(
+                 cores=worker_cores,
+                 memory=worker_memory,
+                 processes=worker_processes,
+                 queue=queue,
+                 walltime=walltime,
+                 python=sys.executable,
+                 job_script_prologue=job_script_prologue,
+             )
+             # Allow arbitrary SLURMCluster kwargs via 'cluster_kwargs'.
+             base.update(options.get('cluster_kwargs') or {})
+             return base
+
+         def _create_cluster(extra_directives: list[str]):
+             # Prefer the new parameter name to avoid a FutureWarning; fall back if unsupported.
+             base = _build_kwargs_base()
+             try:
+                 # Newer dask-jobqueue
+                 base_new = dict(base)
+                 base_new['job_extra_directives'] = extra_directives
+                 return SLURMCluster(**base_new)
+             except TypeError:
+                 # Older dask-jobqueue
+                 base_old = dict(base)
+                 base_old['job_extra'] = extra_directives
+                 return SLURMCluster(**base_old)
+
+         # First attempt with the requested directives
+         self._cluster = _create_cluster(job_extra)
+         self._cluster.scale(num_workers)
+         self._client = Client(self._cluster)
+         address: Optional[str]
+         try:
+             address = self._client.scheduler.address
+         except Exception:
+             address = None
+
+         # Wait briefly for at least one worker; if none arrived and we added exclusivity, retry without it.
+         try:
+             if num_workers > 0:
+                 # 60s should be enough for sbatch to accept or reject worker jobs
+                 self._client.wait_for_workers(min(1, num_workers), timeout=60)
+         except Exception:
+             # If spreading was requested, remove exclusivity and retry once
+             if spread_workers and any(str(opt).startswith('--exclusive') for opt in job_extra):
+                 self.logger.warning("SLURM exclusive allocation not permitted or workers failed to start; retrying without --exclusive")
+                 try:
+                     # Tear down the previous cluster before retrying
+                     self._client.close()
+                 except Exception:
+                     pass
+                 try:
+                     self._cluster.close()
+                 except Exception:
+                     pass
+                 # Rebuild without exclusivity
+                 filtered = [opt for opt in job_extra if not str(opt).startswith('--exclusive')]
+                 self._cluster = _create_cluster(filtered)
+                 self._cluster.scale(num_workers)
+                 self._client = Client(self._cluster)
+                 try:
+                     address = self._client.scheduler.address
+                 except Exception:
+                     address = None
+                 # Don't raise if workers still take long; proceed and let the Dask run degrade gracefully.
+             else:
+                 self.logger.warning("Workers failed to start within the timeout; proceeding anyway")
+
+         self.logger.info(
+             f"Started SLURMCluster: workers={num_workers}, cores/worker={worker_cores}, mem/worker={worker_memory}"
+         )
+         return self._client, address
+
+     def stop(self) -> None:
+         try:
+             if self._client is not None:
+                 self._client.close()
+         finally:
+             self._client = None
+         if self._cluster is not None:
+             try:
+                 self._cluster.close()
+             finally:
+                 self._cluster = None
+
+
+
+ class AnsibleClusterProvider(ClusterProvider):
+     """Provision a Dask cluster on a set of hosts managed via Ansible or SSH.
+
+     This is a minimal stub that expects an address to be provided via options
+     or environment variables and does not itself run Ansible. In a full
+     implementation, this class would orchestrate scheduler/worker processes
+     across inventory hosts and return a connected Client.
+     """
+
+     def __init__(self, logger: Optional[logging.Logger] = None) -> None:
+         super().__init__(logger)
+         self._client = None
+
+     def start(self, num_workers: int, options: Optional[Dict[str, Any]] = None) -> Tuple[Optional[Any], Optional[str]]:
+         options = options or {}
+         scheduler_address = options.get('scheduler_address')
+         if not scheduler_address:
+             # Try the environment variable
+             import os
+             scheduler_address = os.environ.get('DASK_SCHEDULER_ADDRESS')
+         if not scheduler_address:
+             self.logger.error("AnsibleClusterProvider requires 'scheduler_address' in options or the DASK_SCHEDULER_ADDRESS env var")
+             return None, None
+         try:
+             try:
+                 from distributed import Client
+             except Exception:
+                 from dask.distributed import Client
+             self._client = Client(scheduler_address)
+             self.logger.info(f"Connected to existing Dask scheduler at {scheduler_address}")
+             return self._client, scheduler_address
+         except Exception as e:
+             self.logger.error(f"Failed to connect to scheduler at {scheduler_address}: {e}")
+             return None, None
+
+     def stop(self) -> None:
+         if self._client is not None:
+             try:
+                 self._client.close()
+             finally:
+                 self._client = None
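
A minimal usage sketch (not part of the package) showing how a caller might drive these providers. The option keys mirror the ones read in providers.py above; the concrete values (partition name, account directive, worker sizing) are illustrative assumptions:

import logging

from mlops.cluster.providers import SlurmClusterProvider

logging.basicConfig(level=logging.INFO)

provider = SlurmClusterProvider()
client, scheduler_address = provider.start(
    num_workers=4,
    options={
        "worker_cores": 4,                     # threads per worker job
        "worker_memory": "8GB",                # memory per worker job
        "worker_processes": 1,                 # Dask worker processes per job
        "queue": "compute",                    # SLURM partition (assumed name)
        "walltime": "01:00:00",
        "spread_workers_across_nodes": True,   # adds --exclusive to each worker job
        "job_extra": ["--account=myproject"],  # extra sbatch directives (assumed value)
    },
)
try:
    if client is not None:
        print("Scheduler:", scheduler_address)  # submit Dask work via client here
finally:
    provider.stop()

AnsibleClusterProvider follows the same start/stop contract but only attaches to an existing scheduler, taken from options['scheduler_address'] or the DASK_SCHEDULER_ADDRESS environment variable.
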
mlops/core/__init__.py ADDED
@@ -0,0 +1,95 @@
+ """
+ MLOps Core Module (lazy-loading)
+
+ Provides the core components for the NetworkX-based pipeline execution system.
+ Heavy submodules are imported lazily on attribute access to minimize required
+ runtime dependencies for lightweight utilities (e.g., pipeline_utils).
+ """
+
+ from typing import Any
+ import importlib
+
+ __all__ = [
+     # step_system exports
+     "step",
+     "process",
+     "StepContext",
+     "StepContextFactory",
+     "StepDefinition",
+     "StepRegistry",
+     "ProcessDefinition",
+     "ProcessRegistry",
+     "get_step_registry",
+     "get_process_registry",
+     "get_current_context",
+     "set_current_context",
+     "get_context_factory",
+     "set_current_process_context",
+     "get_current_process_context",
+     "get_parameter_resolver",
+     "set_state_manager",
+     "get_state_manager",
+     "log_metric",
+     "SerializableData",
+     "ModelData",
+     # custom model
+     "MLOpsCustomModelBase",
+     # graph types + parser
+     "NetworkXGraphConfig",
+     "ProcessConfig",
+     "StepConfig",
+     "ExecutionResult",
+     "NodeType",
+     "NetworkXPipelineParser",
+     "parse_networkx_pipeline_from_config",
+     # state manager
+     "StepStateManager",
+ ]
+
+ _lazy_attr_to_module = {
+     "step": ("mlops.core.step_system", "step"),
+     "process": ("mlops.core.step_system", "process"),
+     "StepContext": ("mlops.core.step_system", "StepContext"),
+     "StepContextFactory": ("mlops.core.step_system", "StepContextFactory"),
+     "StepDefinition": ("mlops.core.step_system", "StepDefinition"),
+     "StepRegistry": ("mlops.core.step_system", "StepRegistry"),
+     "ProcessDefinition": ("mlops.core.step_system", "ProcessDefinition"),
+     "ProcessRegistry": ("mlops.core.step_system", "ProcessRegistry"),
+     "get_step_registry": ("mlops.core.step_system", "get_step_registry"),
+     "get_process_registry": ("mlops.core.step_system", "get_process_registry"),
+     "get_current_context": ("mlops.core.step_system", "get_current_context"),
+     "set_current_context": ("mlops.core.step_system", "set_current_context"),
+     "get_context_factory": ("mlops.core.step_system", "get_context_factory"),
+     "set_current_process_context": ("mlops.core.step_system", "set_current_process_context"),
+     "get_current_process_context": ("mlops.core.step_system", "get_current_process_context"),
+     "get_parameter_resolver": ("mlops.core.step_system", "get_parameter_resolver"),
+     "set_state_manager": ("mlops.core.step_system", "set_state_manager"),
+     "get_state_manager": ("mlops.core.step_system", "get_state_manager"),
+     "log_metric": ("mlops.core.step_system", "log_metric"),
+     "SerializableData": ("mlops.core.step_system", "SerializableData"),
+     "ModelData": ("mlops.core.step_system", "ModelData"),
+     # custom model base
+     "MLOpsCustomModelBase": ("mlops.core.custom_model_base", "MLOpsCustomModelBase"),
+     # graph types + parser
+     "NetworkXGraphConfig": ("mlops.core.graph_types", "NetworkXGraphConfig"),
+     "ProcessConfig": ("mlops.core.graph_types", "ProcessConfig"),
+     "StepConfig": ("mlops.core.graph_types", "StepConfig"),
+     "ExecutionResult": ("mlops.core.graph_types", "ExecutionResult"),
+     "NodeType": ("mlops.core.graph_types", "NodeType"),
+     "NetworkXPipelineParser": ("mlops.core.networkx_parser", "NetworkXPipelineParser"),
+     "parse_networkx_pipeline_from_config": ("mlops.core.networkx_parser", "parse_networkx_pipeline_from_config"),
+     # state manager
+     "StepStateManager": ("mlops.core.step_state_manager", "StepStateManager"),
+ }
+
+
+ def __getattr__(name: str) -> Any:
+     if name in _lazy_attr_to_module:
+         module_name, attr_name = _lazy_attr_to_module[name]
+         module = importlib.import_module(module_name)
+         return getattr(module, attr_name)
+     raise AttributeError(f"module 'mlops.core' has no attribute '{name}'")
+
+
+ def __dir__() -> list[str]:
+     return sorted(list(globals().keys()) + __all__)
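
The lazy loading above relies on module-level __getattr__ (PEP 562): names in __all__ are resolved to their defining submodule only on first attribute access. A small illustrative sketch, assuming the package and its heavier dependencies are importable:

import sys

import mlops.core

# Importing the package alone pulls in only 'typing' and 'importlib'.
assert "mlops.core.step_system" not in sys.modules

# First attribute access triggers __getattr__, which imports the submodule on demand.
step = mlops.core.step
assert "mlops.core.step_system" in sys.modules

# Unknown names still raise AttributeError, as for a regular module.
try:
    mlops.core.does_not_exist
except AttributeError as exc:
    print(exc)
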
mlops/core/custom_model_base.py ADDED
@@ -0,0 +1,38 @@
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+
+ class MLOpsCustomModelBase:
+     """Lightweight base class for user-defined models.
+
+     This class is intentionally minimal: it stores hyperparameters and (when not
+     explicitly provided) tries to resolve process-scoped hyperparameters from the
+     active `StepContext`.
+     """
+
+     def __init__(self, hyperparameters: Optional[dict[str, Any]] = None) -> None:
+         """Initialize with hyperparameters.
+
+         If not provided, automatically resolve merged hyperparameters from the
+         active step context for the current process (global overrides -> process overrides).
+         """
+         if hyperparameters and isinstance(hyperparameters, dict):
+             self.hyperparameters = hyperparameters
+             return
+         try:
+             from .step_system import get_current_context
+             ctx = get_current_context()
+             if ctx and hasattr(ctx, 'get_hyperparameters'):
+                 proc = getattr(ctx, 'current_process', None)
+                 resolved = ctx.get_hyperparameters(proc)
+                 self.hyperparameters = resolved if isinstance(resolved, dict) else {}
+             else:
+                 self.hyperparameters = {}
+         except Exception:
+             self.hyperparameters = {}
+
+     def get_step_registry(self) -> Any:
+         """Get the step registry containing all @step decorated functions."""
+         from .step_system import get_step_registry
+         return get_step_registry()
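
A hypothetical subclass sketch showing how user code would build on MLOpsCustomModelBase. Outside a running pipeline there is no active StepContext, so hyperparameters are passed explicitly; inside a @process/@step execution they would be resolved from the context automatically. MyModel and its fields are illustrative, not part of the package:

from typing import Any

from mlops.core.custom_model_base import MLOpsCustomModelBase


class MyModel(MLOpsCustomModelBase):
    def fit(self, X: Any, y: Any) -> "MyModel":
        # Read tuning values from the resolved hyperparameters, with defaults.
        self.learning_rate = float(self.hyperparameters.get("learning_rate", 0.01))
        self.n_estimators = int(self.hyperparameters.get("n_estimators", 100))
        # ... fit an estimator here ...
        return self


model = MyModel(hyperparameters={"learning_rate": 0.1, "n_estimators": 200})
print(model.hyperparameters)  # {'learning_rate': 0.1, 'n_estimators': 200}
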