PyPI - zenml-nightly - Versions diffs - 0.66.0.dev20240919__py3-none-any.whl → 0.66.0.dev20240927__py3-none-any.whl - Mend

zenml-nightly 0.66.0.dev20240919__py3-none-any.whl → 0.66.0.dev20240927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py CHANGED Viewed

@@ -15,7 +15,16 @@
 import os
 import re
-from typing import TYPE_CHECKING, Dict, Optional, Tuple, Type, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterator,
+    Optional,
+    Tuple,
+    Type,
+    cast,
+)
 from uuid import UUID
 import boto3
@@ -25,13 +34,15 @@ from sagemaker.network import NetworkConfig
 from sagemaker.processing import ProcessingInput, ProcessingOutput
 from sagemaker.workflow.execution_variables import ExecutionVariables
 from sagemaker.workflow.pipeline import Pipeline
-from sagemaker.workflow.steps import ProcessingStep
+from sagemaker.workflow.steps import ProcessingStep, TrainingStep
 from zenml.config.base_settings import BaseSettings
 from zenml.constants import (
+    METADATA_ORCHESTRATOR_LOGS_URL,
+    METADATA_ORCHESTRATOR_RUN_ID,
     METADATA_ORCHESTRATOR_URL,
 )
-from zenml.enums import StackComponentType
+from zenml.enums import ExecutionStatus, StackComponentType
 from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import (
     SagemakerOrchestratorConfig,
     SagemakerOrchestratorSettings,
@@ -48,7 +59,7 @@ from zenml.stack import StackValidator
 from zenml.utils.env_utils import split_environment_variables
 if TYPE_CHECKING:
-    from zenml.models import PipelineDeploymentResponse
+    from zenml.models import PipelineDeploymentResponse, PipelineRunResponse
     from zenml.stack import Stack
 ENV_ZENML_SAGEMAKER_RUN_ID = "ZENML_SAGEMAKER_RUN_ID"
@@ -58,6 +69,34 @@ POLLING_DELAY = 30
 logger = get_logger(__name__)
+def dissect_pipeline_execution_arn(
+    pipeline_execution_arn: str,
+) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+    """Extract region name, pipeline name, and execution id from the ARN.
+    Args:
+        pipeline_execution_arn: the pipeline execution ARN
+    Returns:
+        Region Name, Pipeline Name, Execution ID in order
+    """
+    # Extract region_name
+    region_match = re.search(r"sagemaker:(.*?):", pipeline_execution_arn)
+    region_name = region_match.group(1) if region_match else None
+    # Extract pipeline_name
+    pipeline_match = re.search(
+        r"pipeline/(.*?)/execution", pipeline_execution_arn
+    )
+    pipeline_name = pipeline_match.group(1) if pipeline_match else None
+    # Extract execution_id
+    execution_match = re.search(r"execution/(.*)", pipeline_execution_arn)
+    execution_id = execution_match.group(1) if execution_match else None
+    return region_name, pipeline_name, execution_id
 class SagemakerOrchestrator(ContainerizedOrchestrator):
     """Orchestrator responsible for running pipelines on Sagemaker."""
@@ -136,42 +175,16 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
         """
         return SagemakerOrchestratorSettings
-    def prepare_or_run_pipeline(
-        self,
-        deployment: "PipelineDeploymentResponse",
-        stack: "Stack",
-        environment: Dict[str, str],
-    ) -> None:
-        """Prepares or runs a pipeline on Sagemaker.
+    def _get_sagemaker_session(self) -> sagemaker.Session:
+        """Method to create the sagemaker session with proper authentication.
-        Args:
-            deployment: The deployment to prepare or run.
-            stack: The stack to run on.
-            environment: Environment variables to set in the orchestration
-                environment.
+        Returns:
+            The Sagemaker Session.
         Raises:
-            RuntimeError: If a connector is used that does not return a
-                `boto3.Session` object.
-            TypeError: If the network_config passed is not compatible with the
-                AWS SageMaker NetworkConfig class.
+            RuntimeError: If the connector returns the wrong type for the
+                session.
         """
-        if deployment.schedule:
-            logger.warning(
-                "The Sagemaker Orchestrator currently does not support the "
-                "use of schedules. The `schedule` will be ignored "
-                "and the pipeline will be run immediately."
-            )
-        # sagemaker requires pipelineName to use alphanum and hyphens only
-        unsanitized_orchestrator_run_name = get_orchestrator_run_name(
-            pipeline_name=deployment.pipeline_configuration.name
-        )
-        # replace all non-alphanum and non-hyphens with hyphens
-        orchestrator_run_name = re.sub(
-            r"[^a-zA-Z0-9\-]", "-", unsanitized_orchestrator_run_name
-        )
         # Get authenticated session
         # Option 1: Service connector
         boto_session: boto3.Session
@@ -205,10 +218,51 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
                     aws_session_token=credentials["SessionToken"],
                     region_name=self.config.region,
                 )
-        session = sagemaker.Session(
+        return sagemaker.Session(
             boto_session=boto_session, default_bucket=self.config.bucket
         )
+    def prepare_or_run_pipeline(
+        self,
+        deployment: "PipelineDeploymentResponse",
+        stack: "Stack",
+        environment: Dict[str, str],
+    ) -> Iterator[Dict[str, MetadataType]]:
+        """Prepares or runs a pipeline on Sagemaker.
+        Args:
+            deployment: The deployment to prepare or run.
+            stack: The stack to run on.
+            environment: Environment variables to set in the orchestration
+                environment.
+        Raises:
+            RuntimeError: If a connector is used that does not return a
+                `boto3.Session` object.
+            TypeError: If the network_config passed is not compatible with the
+                AWS SageMaker NetworkConfig class.
+        Yields:
+            A dictionary of metadata related to the pipeline run.
+        """
+        if deployment.schedule:
+            logger.warning(
+                "The Sagemaker Orchestrator currently does not support the "
+                "use of schedules. The `schedule` will be ignored "
+                "and the pipeline will be run immediately."
+            )
+        # sagemaker requires pipelineName to use alphanum and hyphens only
+        unsanitized_orchestrator_run_name = get_orchestrator_run_name(
+            pipeline_name=deployment.pipeline_configuration.name
+        )
+        # replace all non-alphanum and non-hyphens with hyphens
+        orchestrator_run_name = re.sub(
+            r"[^a-zA-Z0-9\-]", "-", unsanitized_orchestrator_run_name
+        )
+        session = self._get_sagemaker_session()
         # Sagemaker does not allow environment variables longer than 256
         # characters to be passed to Processor steps. If an environment variable
         # is longer than 256 characters, we split it into multiple environment
@@ -238,54 +292,71 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
                 ExecutionVariables.PIPELINE_EXECUTION_ARN
             )
-            # Retrieve Processor arguments provided in the Step settings.
-            processor_args_for_step = step_settings.processor_args or {}
-            # Set default values from configured orchestrator Component to arguments
-            # to be used when they are not present in processor_args.
-            processor_args_for_step.setdefault(
-                "instance_type", step_settings.instance_type
+            use_training_step = (
+                step_settings.use_training_step
+                if step_settings.use_training_step is not None
+                else (
+                    self.config.use_training_step
+                    if self.config.use_training_step is not None
+                    else True
+                )
             )
-            processor_args_for_step.setdefault(
+            # Retrieve Executor arguments provided in the Step settings.
+            if use_training_step:
+                args_for_step_executor = step_settings.estimator_args or {}
+            else:
+                args_for_step_executor = step_settings.processor_args or {}
+            # Set default values from configured orchestrator Component to
+            # arguments to be used when they are not present in the
+            # estimator_args / processor_args provided in the Step settings.
+            args_for_step_executor.setdefault(
                 "role",
-                step_settings.processor_role or self.config.execution_role,
+                step_settings.execution_role or self.config.execution_role,
             )
-            processor_args_for_step.setdefault(
+            args_for_step_executor.setdefault(
                 "volume_size_in_gb", step_settings.volume_size_in_gb
             )
-            processor_args_for_step.setdefault(
+            args_for_step_executor.setdefault(
                 "max_runtime_in_seconds", step_settings.max_runtime_in_seconds
             )
-            processor_args_for_step.setdefault(
+            tags = step_settings.tags
+            args_for_step_executor.setdefault(
                 "tags",
-                [
-                    {"Key": key, "Value": value}
-                    for key, value in step_settings.processor_tags.items()
-                ]
-                if step_settings.processor_tags
-                else None,
+                (
+                    [
+                        {"Key": key, "Value": value}
+                        for key, value in tags.items()
+                    ]
+                    if tags
+                    else None
+                ),
+            )
+            args_for_step_executor.setdefault(
+                "instance_type", step_settings.instance_type
             )
             # Set values that cannot be overwritten
-            processor_args_for_step["image_uri"] = image
-            processor_args_for_step["instance_count"] = 1
-            processor_args_for_step["sagemaker_session"] = session
-            processor_args_for_step["entrypoint"] = entrypoint
-            processor_args_for_step["base_job_name"] = orchestrator_run_name
-            processor_args_for_step["env"] = environment
-            # Convert network_config to sagemaker.network.NetworkConfig if present
-            network_config = processor_args_for_step.get("network_config")
+            args_for_step_executor["image_uri"] = image
+            args_for_step_executor["instance_count"] = 1
+            args_for_step_executor["sagemaker_session"] = session
+            args_for_step_executor["base_job_name"] = orchestrator_run_name
+            # Convert network_config to sagemaker.network.NetworkConfig if
+            # present
+            network_config = args_for_step_executor.get("network_config")
             if network_config and isinstance(network_config, dict):
                 try:
-                    processor_args_for_step["network_config"] = NetworkConfig(
+                    args_for_step_executor["network_config"] = NetworkConfig(
                         **network_config
                     )
                 except TypeError:
-                    # If the network_config passed is not compatible with the NetworkConfig class,
-                    # raise a more informative error.
+                    # If the network_config passed is not compatible with the
+                    # NetworkConfig class, raise a more informative error.
                     raise TypeError(
-                        "Expected a sagemaker.network.NetworkConfig compatible object for the network_config argument, "
+                        "Expected a sagemaker.network.NetworkConfig "
+                        "compatible object for the network_config argument, "
                         "but the network_config processor argument is invalid."
                         "See https://sagemaker.readthedocs.io/en/stable/api/utility/network.html#sagemaker.network.NetworkConfig "
                         "for more information about the NetworkConfig class."
@@ -317,17 +388,21 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
             # Construct S3 outputs from container for step
             outputs = None
+            output_path = None
             if step_settings.output_data_s3_uri is None:
                 pass
             elif isinstance(step_settings.output_data_s3_uri, str):
-                outputs = [
-                    ProcessingOutput(
-                        source="/opt/ml/processing/output/data",
-                        destination=step_settings.output_data_s3_uri,
-                        s3_upload_mode=step_settings.output_data_s3_mode,
-                    )
-                ]
+                if use_training_step:
+                    output_path = step_settings.output_data_s3_uri
+                else:
+                    outputs = [
+                        ProcessingOutput(
+                            source="/opt/ml/processing/output/data",
+                            destination=step_settings.output_data_s3_uri,
+                            s3_upload_mode=step_settings.output_data_s3_mode,
+                        )
+                    ]
             elif isinstance(step_settings.output_data_s3_uri, dict):
                 outputs = []
                 for (
@@ -342,17 +417,37 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
                         )
                     )
-            # Create Processor and ProcessingStep
-            processor = sagemaker.processing.Processor(
-                **processor_args_for_step
-            )
-            sagemaker_step = ProcessingStep(
-                name=step_name,
-                processor=processor,
-                depends_on=step.spec.upstream_steps,
-                inputs=inputs,
-                outputs=outputs,
-            )
+            if use_training_step:
+                # Create Estimator and TrainingStep
+                estimator = sagemaker.estimator.Estimator(
+                    keep_alive_period_in_seconds=step_settings.keep_alive_period_in_seconds,
+                    output_path=output_path,
+                    environment=environment,
+                    container_entry_point=entrypoint,
+                    **args_for_step_executor,
+                )
+                sagemaker_step = TrainingStep(
+                    name=step_name,
+                    depends_on=step.spec.upstream_steps,
+                    inputs=inputs,
+                    estimator=estimator,
+                )
+            else:
+                # Create Processor and ProcessingStep
+                processor = sagemaker.processing.Processor(
+                    entrypoint=entrypoint,
+                    env=environment,
+                    **args_for_step_executor,
+                )
+                sagemaker_step = ProcessingStep(
+                    name=step_name,
+                    processor=processor,
+                    depends_on=step.spec.upstream_steps,
+                    inputs=inputs,
+                    outputs=outputs,
+                )
             sagemaker_steps.append(sagemaker_step)
         # construct the pipeline from the sagemaker_steps
@@ -363,48 +458,37 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
         )
         pipeline.create(role_arn=self.config.execution_role)
-        pipeline_execution = pipeline.start()
+        execution = pipeline.start()
         logger.warning(
             "Steps can take 5-15 minutes to start running "
             "when using the Sagemaker Orchestrator."
         )
+        # Yield metadata based on the generated execution object
+        yield from self.compute_metadata(execution=execution)
         # mainly for testing purposes, we wait for the pipeline to finish
         if self.config.synchronous:
             logger.info(
                 "Executing synchronously. Waiting for pipeline to finish... \n"
-                "At this point you can `Ctrl-C` out without cancelling the execution."
+                "At this point you can `Ctrl-C` out without cancelling the "
+                "execution."
             )
             try:
-                pipeline_execution.wait(
+                execution.wait(
                     delay=POLLING_DELAY, max_attempts=MAX_POLLING_ATTEMPTS
                 )
                 logger.info("Pipeline completed successfully.")
             except WaiterError:
                 raise RuntimeError(
-                    "Timed out while waiting for pipeline execution to finish. For long-running "
-                    "pipelines we recommend configuring your orchestrator for asynchronous execution. "
+                    "Timed out while waiting for pipeline execution to "
+                    "finish. For long-running pipelines we recommend "
+                    "configuring your orchestrator for asynchronous execution. "
                     "The following command does this for you: \n"
-                    f"`zenml orchestrator update {self.name} --synchronous=False`"
+                    f"`zenml orchestrator update {self.name} "
+                    f"--synchronous=False`"
                 )
-    def _get_region_name(self) -> str:
-        """Returns the AWS region name.
-        Returns:
-            The region name.
-        Raises:
-            RuntimeError: If the region name cannot be retrieved.
-        """
-        try:
-            return cast(str, sagemaker.Session().boto_region_name)
-        except Exception as e:
-            raise RuntimeError(
-                "Unable to get region name. Please ensure that you have "
-                "configured your AWS credentials correctly."
-            ) from e
     def get_pipeline_run_metadata(
         self, run_id: UUID
     ) -> Dict[str, "MetadataType"]:
@@ -416,16 +500,17 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
         Returns:
             A dictionary of metadata.
         """
+        pipeline_execution_arn = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID]
         run_metadata: Dict[str, "MetadataType"] = {
-            "pipeline_execution_arn": os.environ[ENV_ZENML_SAGEMAKER_RUN_ID],
+            "pipeline_execution_arn": pipeline_execution_arn,
         }
-        try:
-            region_name = self._get_region_name()
-        except RuntimeError:
-            logger.warning("Unable to get region name from AWS Sagemaker.")
-            return run_metadata
         aws_run_id = os.environ[ENV_ZENML_SAGEMAKER_RUN_ID].split("/")[-1]
+        region_name, _, _ = dissect_pipeline_execution_arn(
+            pipeline_execution_arn=pipeline_execution_arn
+        )
         orchestrator_logs_url = (
             f"https://{region_name}.console.aws.amazon.com/"
             f"cloudwatch/home?region={region_name}#logsV2:log-groups/log-group"
@@ -434,3 +519,173 @@ class SagemakerOrchestrator(ContainerizedOrchestrator):
         )
         run_metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_logs_url)
         return run_metadata
+    def fetch_status(self, run: "PipelineRunResponse") -> ExecutionStatus:
+        """Refreshes the status of a specific pipeline run.
+        Args:
+            run: The run that was executed by this orchestrator.
+        Returns:
+            the actual status of the pipeline job.
+        Raises:
+            AssertionError: If the run was not executed by this orchestrator.
+            ValueError: If it fetches an unknown state or if we can not fetch
+                the orchestrator run ID.
+        """
+        # Make sure that the stack exists and is accessible
+        if run.stack is None:
+            raise ValueError(
+                "The stack that the run was executed on is not available "
+                "anymore."
+            )
+        # Make sure that the run belongs to this orchestrator
+        assert (
+            self.id
+            == run.stack.components[StackComponentType.ORCHESTRATOR][0].id
+        )
+        # Initialize the Sagemaker client
+        session = self._get_sagemaker_session()
+        sagemaker_client = session.sagemaker_client
+        # Fetch the status of the _PipelineExecution
+        if METADATA_ORCHESTRATOR_RUN_ID in run.run_metadata:
+            run_id = run.run_metadata[METADATA_ORCHESTRATOR_RUN_ID].value
+        elif run.orchestrator_run_id is not None:
+            run_id = run.orchestrator_run_id
+        else:
+            raise ValueError(
+                "Can not find the orchestrator run ID, thus can not fetch "
+                "the status."
+            )
+        status = sagemaker_client.describe_pipeline_execution(
+            PipelineExecutionArn=run_id
+        )["PipelineExecutionStatus"]
+        # Map the potential outputs to ZenML ExecutionStatus. Potential values:
+        # https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribePipelineExecution.html
+        if status in ["Executing", "Stopping"]:
+            return ExecutionStatus.RUNNING
+        elif status in ["Stopped", "Failed"]:
+            return ExecutionStatus.FAILED
+        elif status in ["Succeeded"]:
+            return ExecutionStatus.COMPLETED
+        else:
+            raise ValueError("Unknown status for the pipeline execution.")
+    def compute_metadata(
+        self, execution: Any
+    ) -> Iterator[Dict[str, MetadataType]]:
+        """Generate run metadata based on the generated Sagemaker Execution.
+        Args:
+            execution: The corresponding _PipelineExecution object.
+        Yields:
+            A dictionary of metadata related to the pipeline run.
+        """
+        # Metadata
+        metadata: Dict[str, MetadataType] = {}
+        # Orchestrator Run ID
+        if run_id := self._compute_orchestrator_run_id(execution):
+            metadata[METADATA_ORCHESTRATOR_RUN_ID] = run_id
+        # URL to the Sagemaker's pipeline view
+        if orchestrator_url := self._compute_orchestrator_url(execution):
+            metadata[METADATA_ORCHESTRATOR_URL] = Uri(orchestrator_url)
+        # URL to the corresponding CloudWatch page
+        if logs_url := self._compute_orchestrator_logs_url(execution):
+            metadata[METADATA_ORCHESTRATOR_LOGS_URL] = Uri(logs_url)
+        yield metadata
+    @staticmethod
+    def _compute_orchestrator_url(
+        pipeline_execution: Any,
+    ) -> Optional[str]:
+        """Generate the Orchestrator Dashboard URL upon pipeline execution.
+        Args:
+            pipeline_execution: The corresponding _PipelineExecution object.
+        Returns:
+             the URL to the dashboard view in SageMaker.
+        """
+        try:
+            region_name, pipeline_name, execution_id = (
+                dissect_pipeline_execution_arn(pipeline_execution.arn)
+            )
+            # Get the Sagemaker session
+            session = pipeline_execution.sagemaker_session
+            # List the Studio domains and get the Studio Domain ID
+            domains_response = session.sagemaker_client.list_domains()
+            studio_domain_id = domains_response["Domains"][0]["DomainId"]
+            return (
+                f"https://studio-{studio_domain_id}.studio.{region_name}."
+                f"sagemaker.aws/pipelines/view/{pipeline_name}/executions"
+                f"/{execution_id}/graph"
+            )
+        except Exception as e:
+            logger.warning(
+                f"There was an issue while extracting the pipeline url: {e}"
+            )
+            return None
+    @staticmethod
+    def _compute_orchestrator_logs_url(
+        pipeline_execution: Any,
+    ) -> Optional[str]:
+        """Generate the CloudWatch URL upon pipeline execution.
+        Args:
+            pipeline_execution: The corresponding _PipelineExecution object.
+        Returns:
+            the URL querying the pipeline logs in CloudWatch on AWS.
+        """
+        try:
+            region_name, _, execution_id = dissect_pipeline_execution_arn(
+                pipeline_execution.arn
+            )
+            return (
+                f"https://{region_name}.console.aws.amazon.com/"
+                f"cloudwatch/home?region={region_name}#logsV2:log-groups/log-group"
+                f"/$252Faws$252Fsagemaker$252FProcessingJobs$3FlogStreamNameFilter"
+                f"$3Dpipelines-{execution_id}-"
+            )
+        except Exception as e:
+            logger.warning(
+                f"There was an issue while extracting the logs url: {e}"
+            )
+            return None
+    @staticmethod
+    def _compute_orchestrator_run_id(
+        pipeline_execution: Any,
+    ) -> Optional[str]:
+        """Fetch the Orchestrator Run ID upon pipeline execution.
+        Args:
+            pipeline_execution: The corresponding _PipelineExecution object.
+        Returns:
+             the Execution ID of the run in SageMaker.
+        """
+        try:
+            return str(pipeline_execution.arn)
+        except Exception as e:
+            logger.warning(
+                f"There was an issue while extracting the pipeline run ID: {e}"
+            )
+            return None

zenml-nightly 0.66.0.dev20240919__py3-none-any.whl → 0.66.0.dev20240927__py3-none-any.whl

zenml-nightly 0.66.0.dev20240919__py3-none-any.whl → 0.66.0.dev20240927__py3-none-any.whl