PyPI - oracle-ads - Versions diffs - 2.13.10rc0__py3-none-any.whl → 2.13.12__py3-none-any.whl - Mend

oracle-ads 2.13.10rc0__py3-none-any.whl → 2.13.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

ads/aqua/app.py +13 -7
ads/aqua/cli.py +15 -0
ads/aqua/common/entities.py +31 -5
ads/aqua/common/utils.py +35 -0
ads/aqua/config/container_config.py +0 -1
ads/aqua/evaluation/evaluation.py +5 -4
ads/aqua/extension/deployment_handler.py +4 -1
ads/aqua/extension/model_handler.py +1 -1
ads/aqua/model/enums.py +19 -1
ads/aqua/model/model.py +45 -36
ads/aqua/model/utils.py +1 -2
ads/aqua/modeldeployment/config_loader.py +815 -0
ads/aqua/modeldeployment/constants.py +4 -1
ads/aqua/modeldeployment/deployment.py +100 -124
ads/aqua/modeldeployment/entities.py +4 -178
ads/aqua/modeldeployment/model_group_config.py +240 -0
ads/aqua/modeldeployment/utils.py +0 -539
ads/common/work_request.py +39 -38
ads/jobs/builders/infrastructure/dsc_job.py +121 -24
ads/jobs/builders/infrastructure/dsc_job_runtime.py +71 -24
ads/jobs/builders/runtimes/base.py +7 -5
ads/jobs/builders/runtimes/pytorch_runtime.py +6 -8
ads/jobs/templates/driver_pytorch.py +486 -172
ads/jobs/templates/driver_utils.py +27 -11
ads/model/service/oci_datascience_model_deployment.py +6 -11
ads/telemetry/client.py +4 -4
{oracle_ads-2.13.10rc0.dist-info → oracle_ads-2.13.12.dist-info}/METADATA +2 -2
{oracle_ads-2.13.10rc0.dist-info → oracle_ads-2.13.12.dist-info}/RECORD +31 -29
{oracle_ads-2.13.10rc0.dist-info → oracle_ads-2.13.12.dist-info}/WHEEL +0 -0
{oracle_ads-2.13.10rc0.dist-info → oracle_ads-2.13.12.dist-info}/entry_points.txt +0 -0
{oracle_ads-2.13.10rc0.dist-info → oracle_ads-2.13.12.dist-info}/licenses/LICENSE.txt +0 -0

ads/common/work_request.py CHANGED Viewed

@@ -1,7 +1,5 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
-# Copyright (c) 2024 Oracle and/or its affiliates.
+# Copyright (c) 2024, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 import logging
@@ -12,6 +10,7 @@ from typing import Callable
 import oci
 from oci import Signer
 from tqdm.auto import tqdm
 from ads.common.oci_datascience import OCIDataScienceMixin
 logger = logging.getLogger(__name__)
@@ -20,10 +19,10 @@ WORK_REQUEST_STOP_STATE = ("SUCCEEDED", "FAILED", "CANCELED")
 DEFAULT_WAIT_TIME = 1200
 DEFAULT_POLL_INTERVAL = 10
 WORK_REQUEST_PERCENTAGE = 100
-# default tqdm progress bar format:
+# default tqdm progress bar format:
 # {l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, ' '{rate_fmt}{postfix}]
 # customize the bar format to remove the {n_fmt}/{total_fmt} from the right side
-DEFAULT_BAR_FORMAT = '{l_bar}{bar}| [{elapsed}<{remaining}, ' '{rate_fmt}{postfix}]'
+DEFAULT_BAR_FORMAT = "{l_bar}{bar}| [{elapsed}<{remaining}, " "{rate_fmt}{postfix}]"
 class DataScienceWorkRequest(OCIDataScienceMixin):
@@ -32,13 +31,13 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
     """
     def __init__(
-        self,
-        id: str,
+        self,
+        id: str,
         description: str = "Processing",
-        config: dict = None,
-        signer: Signer = None,
-        client_kwargs: dict = None,
-        **kwargs
+        config: dict = None,
+        signer: Signer = None,
+        client_kwargs: dict = None,
+        **kwargs,
     ) -> None:
         """Initializes ADSWorkRequest object.
@@ -49,41 +48,43 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
         description: str
             Progress bar initial step description (Defaults to `Processing`).
         config : dict, optional
-            OCI API key config dictionary to initialize
+            OCI API key config dictionary to initialize
             oci.data_science.DataScienceClient (Defaults to None).
         signer : oci.signer.Signer, optional
-            OCI authentication signer to initialize
+            OCI authentication signer to initialize
             oci.data_science.DataScienceClient (Defaults to None).
         client_kwargs : dict, optional
-            Additional client keyword arguments to initialize
+            Additional client keyword arguments to initialize
             oci.data_science.DataScienceClient (Defaults to None).
         kwargs:
-            Additional keyword arguments to initialize
+            Additional keyword arguments to initialize
             oci.data_science.DataScienceClient.
         """
         self.id = id
         self._description = description
         self._percentage = 0
         self._status = None
+        self._error_message = ""
         super().__init__(config, signer, client_kwargs, **kwargs)
     def _sync(self):
         """Fetches the latest work request information to ADSWorkRequest object."""
         work_request = self.client.get_work_request(self.id).data
-        work_request_logs = self.client.list_work_request_logs(
-            self.id
-        ).data
+        work_request_logs = self.client.list_work_request_logs(self.id).data
-        self._percentage= work_request.percent_complete
+        self._percentage = work_request.percent_complete
         self._status = work_request.status
-        self._description = work_request_logs[-1].message if work_request_logs else "Processing"
+        self._description = (
+            work_request_logs[-1].message if work_request_logs else "Processing"
+        )
+        if work_request.status == "FAILED":
+            self._error_message = self.client.list_work_request_errors(self.id).data
     def watch(
-        self,
+        self,
         progress_callback: Callable,
-        max_wait_time: int=DEFAULT_WAIT_TIME,
-        poll_interval: int=DEFAULT_POLL_INTERVAL,
+        max_wait_time: int = DEFAULT_WAIT_TIME,
+        poll_interval: int = DEFAULT_POLL_INTERVAL,
     ):
         """Updates the progress bar with realtime message and percentage until the process is completed.
@@ -92,10 +93,10 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
         progress_callback: Callable
             Progress bar callback function.
             It must accept `(percent_change, description)` where `percent_change` is the
-            work request percent complete and `description` is the latest work request log message.
+            work request percent complete and `description` is the latest work request log message.
         max_wait_time: int
             Maximum amount of time to wait in seconds (Defaults to 1200).
-            Negative implies infinite wait time.
+            Negative implies infinite wait time.
         poll_interval: int
             Poll interval in seconds (Defaults to 10).
@@ -107,7 +108,6 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
         start_time = time.time()
         while self._percentage < 100:
             seconds_since = time.time() - start_time
             if max_wait_time > 0 and seconds_since >= max_wait_time:
                 logger.error(f"Exceeded max wait time of {max_wait_time} seconds.")
@@ -124,12 +124,14 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
             percent_change = self._percentage - previous_percent_complete
             previous_percent_complete = self._percentage
             progress_callback(
-                percent_change=percent_change,
-                description=self._description
+                percent_change=percent_change, description=self._description
             )
             if self._status in WORK_REQUEST_STOP_STATE:
-                if self._status != oci.work_requests.models.WorkRequest.STATUS_SUCCEEDED:
+                if (
+                    self._status
+                    != oci.work_requests.models.WorkRequest.STATUS_SUCCEEDED
+                ):
                     if self._description:
                         raise Exception(self._description)
                     else:
@@ -145,12 +147,12 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
     def wait_work_request(
         self,
-        progress_bar_description: str="Processing",
-        max_wait_time: int=DEFAULT_WAIT_TIME,
-        poll_interval: int=DEFAULT_POLL_INTERVAL
+        progress_bar_description: str = "Processing",
+        max_wait_time: int = DEFAULT_WAIT_TIME,
+        poll_interval: int = DEFAULT_POLL_INTERVAL,
     ):
         """Waits for the work request progress bar to be completed.
         Parameters
         ----------
         progress_bar_description: str
@@ -160,7 +162,7 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
             Negative implies infinite wait time.
         poll_interval: int
             Poll interval in seconds (Defaults to 10).
         Returns
         -------
         None
@@ -172,7 +174,7 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
             mininterval=0,
             file=sys.stdout,
             desc=progress_bar_description,
-            bar_format=DEFAULT_BAR_FORMAT
+            bar_format=DEFAULT_BAR_FORMAT,
         ) as pbar:
             def progress_callback(percent_change, description):
@@ -184,6 +186,5 @@ class DataScienceWorkRequest(OCIDataScienceMixin):
             self.watch(
                 progress_callback=progress_callback,
                 max_wait_time=max_wait_time,
-                poll_interval=poll_interval
+                poll_interval=poll_interval,
             )

ads/jobs/builders/infrastructure/dsc_job.py CHANGED Viewed

@@ -1,7 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
-# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 from __future__ import annotations
@@ -21,30 +20,33 @@ import fsspec
 import oci
 import oci.data_science
 import oci.util as oci_util
+import yaml
+from oci.data_science import models
 from oci.data_science.models import JobInfrastructureConfigurationDetails
 from oci.exceptions import ServiceError
-import yaml
 from ads.common import utils
+from ads.common.decorator.utils import class_or_instance_method
+from ads.common.dsc_file_system import (
+    DSCFileSystemManager,
+    OCIFileStorage,
+    OCIObjectStorage,
+)
 from ads.common.oci_datascience import DSCNotebookSession, OCIDataScienceMixin
 from ads.common.oci_logging import OCILog
 from ads.common.oci_resource import ResourceNotFoundError
 from ads.jobs.builders.infrastructure.base import Infrastructure, RunInstance
 from ads.jobs.builders.infrastructure.dsc_job_runtime import (
+    MULTI_NODE_JOB_SUPPORT,
     ContainerRuntimeHandler,
     DataScienceJobRuntimeManager,
 )
 from ads.jobs.builders.infrastructure.utils import get_value
 from ads.jobs.builders.runtimes.artifact import Artifact
+from ads.jobs.builders.runtimes.base import MultiNodeRuntime
 from ads.jobs.builders.runtimes.container_runtime import ContainerRuntime
 from ads.jobs.builders.runtimes.python_runtime import GitPythonRuntime
-from ads.common.dsc_file_system import (
-    OCIFileStorage,
-    DSCFileSystemManager,
-    OCIObjectStorage,
-)
-from ads.common.decorator.utils import class_or_instance_method
 logger = logging.getLogger(__name__)
 SLEEP_INTERVAL = 3
@@ -52,6 +54,7 @@ WAIT_SECONDS_AFTER_FINISHED = 90
 MAXIMUM_MOUNT_COUNT = 5
 FILE_STORAGE_TYPE = "FILE_STORAGE"
 OBJECT_STORAGE_TYPE = "OBJECT_STORAGE"
+DEFAULT_NODE_GROUP_NAME = "node-group"
 class DSCJob(OCIDataScienceMixin, oci.data_science.models.Job):
@@ -284,11 +287,15 @@ class DSCJob(OCIDataScienceMixin, oci.data_science.models.Job):
     def load_defaults(self) -> DSCJob:
         self.load_properties_from_env()
+        if getattr(self, "job_node_configuration_details", None):
+            return self
+        # Following are for single node job run only
         if not self.job_infrastructure_configuration_details:
             self.job_infrastructure_configuration_details = {}
         # Convert the dict to JobInfrastructureConfigurationDetails object
         if isinstance(self.job_infrastructure_configuration_details, dict):
-            # Default networking
             if not self.job_infrastructure_configuration_details.get(
                 "jobInfrastructureType"
             ):
@@ -352,6 +359,7 @@ class DSCJob(OCIDataScienceMixin, oci.data_science.models.Job):
             raise ValueError("Specify compartment ID for data science job.")
         if not self.project_id:
             raise ValueError("Specify project ID for data science job.")
         self._create_with_oci_api()
         return self
@@ -498,7 +506,9 @@ class DSCJob(OCIDataScienceMixin, oci.data_science.models.Job):
         keys = list(kwargs.keys())
         for key in keys:
             if key in config_swagger_types:
-                config_kwargs[key] = kwargs.pop(key)
+                val = kwargs.pop(key)
+                if val is not None:
+                    config_kwargs[key] = val
             elif key in env_config_swagger_types:
                 value = kwargs.pop(key)
                 if key in [
@@ -545,6 +555,25 @@ class DSCJob(OCIDataScienceMixin, oci.data_science.models.Job):
                 env_config_override
             )
+        if getattr(self, "job_node_configuration_details", None):
+            job_config_override = kwargs.pop("job_configuration_override_details", None)
+            env_config_override = kwargs.pop(
+                "job_environment_configuration_override_details", None
+            )
+            if job_config_override or env_config_override:
+                node_config = {
+                    "jobNodeType": "MULTI_NODE",
+                    "jobNodeGroupConfigurationDetailsList": [
+                        {
+                            # Node group name must match the node group name in the job.
+                            "name": DEFAULT_NODE_GROUP_NAME,
+                            "JobConfigurationDetails": job_config_override,
+                            "JobEnvironmentConfigurationDetails": env_config_override,
+                        }
+                    ],
+                }
+                kwargs["job_node_configuration_override_details"] = node_config
         wait = kwargs.pop("wait", False)
         run = DataScienceJobRun(**kwargs, **self.auth).create()
         if wait:
@@ -756,13 +785,11 @@ class DataScienceJobRun(
                 return True
             # Stop only if time_finished is over 2 minute ago.
             # This is for the time delay between job run stopped and the logs appear in oci logging.
-            if (
+            return (
                 datetime.datetime.now(self.time_finished.tzinfo)
                 - datetime.timedelta(seconds=wait)
                 > self.time_finished
-            ):
-                return True
-            return False
+            )
         if not self.log_id and not self.log_group_id:
             print(
@@ -1471,6 +1498,23 @@ class DataScienceJob(Infrastructure):
         }
         self.dsc_job = dsc_job
+        # Process multi-node infrastructure config
+        node_groups = get_value(
+            dsc_job,
+            "job_node_configuration_details.job_node_group_configuration_details_list",
+        )
+        if node_groups and len(node_groups) == 1:
+            node_group = node_groups[0]
+            dsc_job.job_infrastructure_configuration_details = (
+                node_group.job_infrastructure_configuration_details
+            )
+            subnet_id = get_value(
+                dsc_job,
+                "job_node_configuration_details.job_network_configuration.subnet_id",
+            )
+            if subnet_id:
+                self.set_spec(self.CONST_SUBNET_ID, subnet_id)
         for infra_attr, dsc_attr in self.payload_attribute_map.items():
             value = get_value(dsc_job, dsc_attr)
             if not value:
@@ -1557,10 +1601,13 @@ class DataScienceJob(Infrastructure):
             if value:
                 dsc_job.job_infrastructure_configuration_details[camel_attr] = value
-        if not dsc_job.job_infrastructure_configuration_details.get(
-            "shapeName", ""
-        ).endswith("Flex") and dsc_job.job_infrastructure_configuration_details.get(
-            "jobShapeConfigDetails"
+        shape = dsc_job.job_infrastructure_configuration_details.get("shapeName", "")
+        if (
+            shape
+            and not str(shape).endswith("Flex")
+            and dsc_job.job_infrastructure_configuration_details.get(
+                "jobShapeConfigDetails"
+            )
         ):
             raise ValueError(
                 "Shape config is not required for non flex shape from user end."
@@ -1583,7 +1630,6 @@ class DataScienceJob(Infrastructure):
         return self
     def build(self) -> DataScienceJob:
-        self.dsc_job.load_defaults()
         try:
             self.dsc_job.load_defaults()
@@ -1611,6 +1657,48 @@ class DataScienceJob(Infrastructure):
             )
         )
+    def _config_multi_node(self, runtime: MultiNodeRuntime):
+        """Configure the payload for multi-node job run."""
+        infra_config: dict = self.dsc_job.job_infrastructure_configuration_details
+        job_config: models.DefaultJobConfigurationDetails = (
+            self.dsc_job.job_configuration_details
+        )
+        env_config = self.dsc_job.job_environment_configuration_details
+        # For multi-node jobs,
+        # the job_infrastructure_configuration_details and job_configuration_details
+        # should be the special EMPTY class.
+        # The job_environment_configuration_details should be None.
+        # The configs will be specified in each node group.
+        self.dsc_job.job_infrastructure_configuration_details = None
+        self.dsc_job.job_configuration_details = None
+        self.dsc_job.job_environment_configuration_details = None
+        subnet_id = infra_config.pop("subnetId", None)
+        infra_config["jobInfrastructureType"] = (
+            models.MultiNodeJobInfrastructureConfigurationDetails.JOB_INFRASTRUCTURE_TYPE_MULTI_NODE
+        )
+        if subnet_id:
+            network_config = models.JobCustomNetworkConfiguration(subnet_id=subnet_id)
+        else:
+            network_config = models.JobDefaultNetworkConfiguration()
+        node_group_config: dict = {
+            "name": DEFAULT_NODE_GROUP_NAME,
+            "replicas": runtime.replica,
+            "minimumSuccessReplicas": runtime.replica,
+            "jobInfrastructureConfigurationDetails": infra_config,
+            "jobConfigurationDetails": job_config,
+            "jobEnvironmentConfigurationDetails": env_config,
+        }
+        self.dsc_job.job_node_configuration_details = {
+            "jobNodeType": "MULTI_NODE",
+            "startupOrder": "IN_PARALLEL",
+            "jobNetworkConfiguration": network_config,
+            "jobNodeGroupConfigurationDetailsList": [node_group_config],
+        }
     def create(self, runtime, **kwargs) -> DataScienceJob:
         """Creates a job with runtime.
@@ -1635,9 +1723,7 @@ class DataScienceJob(Infrastructure):
         if self.name:
             display_name = Template(self.name).safe_substitute(runtime.envs)
-        elif isinstance(runtime, GitPythonRuntime) or isinstance(
-            runtime, ContainerRuntime
-        ):
+        elif isinstance(runtime, (GitPythonRuntime, ContainerRuntime)):
             display_name = utils.get_random_name_for_resource()
         else:
             display_name = None
@@ -1652,11 +1738,22 @@ class DataScienceJob(Infrastructure):
         self.dsc_job = DSCJob(**payload, **self.auth)
         # Set Job infra to user values after DSCJob initialized the defaults
         self._update_job_infra(self.dsc_job)
+        if self.is_multi_node_job(runtime):
+            self._config_multi_node(runtime=runtime)
         self.dsc_job.create()
         # Update the model from infra after job creation.
         self._update_from_dsc_model(self.dsc_job)
         return self
+    @staticmethod
+    def is_multi_node_job(runtime):
+        """Check if the job is multi-node job."""
+        return (
+            MULTI_NODE_JOB_SUPPORT
+            and isinstance(runtime, MultiNodeRuntime)
+            and runtime.replica > 1
+        )
     def run(
         self,
         name=None,

ads/jobs/builders/infrastructure/dsc_job_runtime.py CHANGED Viewed

@@ -1,7 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
-# Copyright (c) 2021, 2024 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 """Contains classes for conversion between ADS runtime and OCI Data Science Job implementation.
 This module is for ADS developers only.
@@ -19,29 +18,37 @@ import os
 import shlex
 from typing import Optional
 from urllib import parse
+import oci
 from ads.common.utils import extract_region
+from ads.jobs.builders.infrastructure.utils import get_value
+from ads.jobs.builders.runtimes.artifact import (
+    GitPythonArtifact,
+    NotebookArtifact,
+    PythonArtifact,
+    ScriptArtifact,
+)
 from ads.jobs.builders.runtimes.base import Runtime
+from ads.jobs.builders.runtimes.container_runtime import ContainerRuntime
 from ads.jobs.builders.runtimes.python_runtime import (
     CondaRuntime,
-    ScriptRuntime,
-    PythonRuntime,
-    NotebookRuntime,
     GitPythonRuntime,
+    NotebookRuntime,
+    PythonRuntime,
+    ScriptRuntime,
 )
-from ads.jobs.builders.runtimes.container_runtime import ContainerRuntime
 from ads.jobs.builders.runtimes.pytorch_runtime import (
-    PyTorchDistributedRuntime,
     PyTorchDistributedArtifact,
+    PyTorchDistributedRuntime,
 )
-from ads.jobs.builders.runtimes.artifact import (
-    ScriptArtifact,
-    NotebookArtifact,
-    PythonArtifact,
-    GitPythonArtifact,
-)
-from ads.opctl.distributed.common import cluster_config_helper
-from ads.jobs.builders.infrastructure.utils import get_value
 from ads.jobs.templates import driver_utils
+from ads.opctl.distributed.common import cluster_config_helper
+if hasattr(oci.data_science.models, "MultiNodeJobInfrastructureConfigurationDetails"):
+    MULTI_NODE_JOB_SUPPORT = True
+else:
+    MULTI_NODE_JOB_SUPPORT = False
 class IncompatibleRuntime(Exception):  # pragma: no cover
@@ -77,6 +84,9 @@ class RuntimeHandler:
     # Defines the class of the runtime to be handled.
     RUNTIME_CLASS = Runtime
+    CONST_WORKER_COUNT = "OCI__WORKER_COUNT"
+    CONST_NODE_COUNT = "NODE_COUNT"
     def __init__(self, data_science_job) -> None:
         """Initialize the runtime handler.
@@ -285,7 +295,7 @@ class RuntimeHandler:
         * _extract_artifact()
         * _extract_runtime_minutes()
         Each of these method returns a dict for specifying the runtime.
-        The dictionaries are combined before initalizing the runtime.
+        The dictionaries are combined before initializing the runtime.
         A sub-class can modify one of more of these methods.
         Parameters
@@ -349,6 +359,30 @@ class RuntimeHandler:
             return {Runtime.CONST_ARGS: shlex.split(args_string)}
         return {}
+    def _get_node_group(self, dsc_job):
+        """Gets the node group for multi-node job with single node group."""
+        node_groups = get_value(
+            dsc_job,
+            "job_node_configuration_details.job_node_group_configuration_details_list",
+        )
+        if node_groups and len(node_groups) == 1:
+            return node_groups[0]
+        return None
+    def _get_replica(self, dsc_job, envs):
+        node_group = self._get_node_group(dsc_job)
+        if node_group:
+            replica = get_value(node_group, "replicas")
+        elif not envs:
+            replica = None
+        elif self.CONST_WORKER_COUNT in envs:
+            replica = int(envs.pop(self.CONST_WORKER_COUNT)) + 1
+        elif self.CONST_NODE_COUNT in envs:
+            replica = int(envs.pop(self.CONST_NODE_COUNT))
+        else:
+            replica = None
+        return replica
     def _extract_envs(self, dsc_job):
         """Extract the environment variables from data science job.
@@ -362,7 +396,12 @@ class RuntimeHandler:
         dict
             A runtime specification dictionary for initializing a runtime.
         """
-        envs = get_value(dsc_job, "job_configuration_details.environment_variables")
+        env_attr = "job_configuration_details.environment_variables"
+        node_group = self._get_node_group(dsc_job)
+        if node_group:
+            envs = get_value(node_group, env_attr)
+        else:
+            envs = get_value(dsc_job, env_attr)
         if envs:
             return {Runtime.CONST_ENV_VAR: envs}
         return {}
@@ -968,6 +1007,12 @@ class ContainerRuntimeHandler(RuntimeHandler):
         payload["job_environment_configuration_details"] = job_env_config
         return payload
+    def _translate_env(self, runtime):
+        envs = super()._translate_env(runtime)
+        if runtime.replica:
+            envs[self.CONST_NODE_COUNT] = str(runtime.replica)
+        return envs
     def _translate_artifact(self, runtime: ContainerRuntime):
         """Additional artifact for the container"""
         if runtime.artifact_uri:
@@ -1049,6 +1094,10 @@ class ContainerRuntimeHandler(RuntimeHandler):
         if envs:
             spec[ContainerRuntime.CONST_ENV_VAR] = envs
+        replica = self._get_replica(dsc_job=dsc_job, envs=envs)
+        if replica:
+            spec[ContainerRuntime.CONST_REPLICA] = replica
         return spec
     def _extract_properties(self, dsc_job) -> dict:
@@ -1081,7 +1130,6 @@ class ContainerRuntimeHandler(RuntimeHandler):
 class PyTorchDistributedRuntimeHandler(PythonRuntimeHandler):
     RUNTIME_CLASS = PyTorchDistributedRuntime
-    CONST_WORKER_COUNT = "OCI__WORKER_COUNT"
     CONST_COMMAND = "OCI__LAUNCH_CMD"
     CONST_DEEPSPEED = "OCI__DEEPSPEED"
@@ -1105,8 +1153,7 @@ class PyTorchDistributedRuntimeHandler(PythonRuntimeHandler):
     def _translate_env(self, runtime: PyTorchDistributedRuntime) -> dict:
         envs = super()._translate_env(runtime)
         replica = runtime.replica if runtime.replica else 1
-        # WORKER_COUNT = REPLICA - 1 so that it will be same as distributed training
-        envs[self.CONST_WORKER_COUNT] = str(replica - 1)
+        envs[self.CONST_NODE_COUNT] = str(replica)
         envs[self.CONST_JOB_ENTRYPOINT] = PyTorchDistributedArtifact.CONST_DRIVER_SCRIPT
         if runtime.inputs:
             envs[driver_utils.CONST_ENV_INPUT_MAPPINGS] = json.dumps(runtime.inputs)
@@ -1131,12 +1178,12 @@ class PyTorchDistributedRuntimeHandler(PythonRuntimeHandler):
     def _extract_envs(self, dsc_job) -> dict:
         spec = super()._extract_envs(dsc_job)
         envs = spec.pop(PythonRuntime.CONST_ENV_VAR, {})
-        if self.CONST_WORKER_COUNT not in envs:
+        replica = self._get_replica(dsc_job, envs=envs)
+        if not replica:
             raise IncompatibleRuntime()
         # Replicas
-        spec[PyTorchDistributedRuntime.CONST_REPLICA] = (
-            int(envs.pop(self.CONST_WORKER_COUNT)) + 1
-        )
+        spec[PyTorchDistributedRuntime.CONST_REPLICA] = replica
         # Git
         if cluster_config_helper.OCI__RUNTIME_URI in envs:
             git_spec = {}

ads/jobs/builders/runtimes/base.py CHANGED Viewed

@@ -1,17 +1,16 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
-# Copyright (c) 2022, 2024 Oracle and/or its affiliates.
+# Copyright (c) 2022, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 from __future__ import annotations
 import re
 import time
 import traceback
 from typing import Dict, TypeVar
-from ads.jobs.builders.base import Builder
-from ads.jobs import env_var_parser
+from ads.jobs import env_var_parser
+from ads.jobs.builders.base import Builder
 Self = TypeVar("Self", bound="Runtime")
@@ -285,6 +284,9 @@ class MultiNodeRuntime(Runtime):
     def run(self, dsc_job, **kwargs):
         """Starts the job runs"""
+        # For multi-node job, there is no need to create multiple job run.
+        if getattr(dsc_job, "job_node_configuration_details", None):
+            return dsc_job.run(**kwargs)
         replicas = self.replica if self.replica else 1
         main_run = None
         job_runs = []

oracle-ads 2.13.10rc0__py3-none-any.whl → 2.13.12__py3-none-any.whl

oracle-ads 2.13.10rc0__py3-none-any.whl → 2.13.12__py3-none-any.whl