PyPI - snowflake-ml-python - Versions diffs - 1.11.0__py3-none-any.whl → 1.13.0__py3-none-any.whl - Mend

snowflake-ml-python 1.11.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (198) hide show

snowflake/ml/feature_store/feature_view.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import json
+import logging
 import re
 import warnings
 from collections import OrderedDict
@@ -31,10 +32,12 @@ from snowflake.snowpark.types import (
     _NumericType,
 )
+_DEFAULT_TARGET_LAG = "10 seconds"
 _FEATURE_VIEW_NAME_DELIMITER = "$"
 _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS = ["FS_TIMESTAMP_COL_PLACEHOLDER_VAL", "NULL"]
 _TIMESTAMP_COL_PLACEHOLDER = "NULL"
 _FEATURE_OBJ_TYPE = "FEATURE_OBJ_TYPE"
+_ONLINE_TABLE_SUFFIX = "$ONLINE"
 # Feature view version rule is aligned with dataset version rule in SQL.
 _FEATURE_VIEW_VERSION_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_.\-]*$")
 _FEATURE_VIEW_VERSION_MAX_LENGTH = 128
@@ -45,6 +48,44 @@ _RESULT_SCAN_QUERY_PATTERN = re.compile(
 )
+@dataclass(frozen=True)
+class OnlineConfig:
+    """Configuration for online feature storage."""
+    enable: bool = False
+    target_lag: Optional[str] = None
+    def __post_init__(self) -> None:
+        if self.target_lag is None:
+            return
+        if not isinstance(self.target_lag, str) or not self.target_lag.strip():
+            raise ValueError("target_lag must be a non-empty string")
+        object.__setattr__(self, "target_lag", self.target_lag.strip())
+    def to_json(self) -> str:
+        data: dict[str, Any] = asdict(self)
+        return json.dumps(data)
+    @classmethod
+    def from_json(cls, json_str: str) -> OnlineConfig:
+        data = json.loads(json_str)
+        return cls(**data)
+class StoreType(Enum):
+    """
+    Enumeration for specifying the storage type when reading from or refreshing feature views.
+    The Feature View supports two storage modes:
+    - OFFLINE: Traditional batch storage for historical feature data and training
+    - ONLINE: Low-latency storage optimized for real-time feature serving
+    """
+    ONLINE = "online"
+    OFFLINE = "offline"
 @dataclass(frozen=True)
 class _FeatureViewMetadata:
     """Represent metadata tracked on top of FV backend object"""
@@ -171,6 +212,7 @@ class FeatureView(lineage_node.LineageNode):
         initialize: str = "ON_CREATE",
         refresh_mode: str = "AUTO",
         cluster_by: Optional[list[str]] = None,
+        online_config: Optional[OnlineConfig] = None,
         **_kwargs: Any,
     ) -> None:
         """
@@ -204,6 +246,8 @@ class FeatureView(lineage_node.LineageNode):
             cluster_by: Columns to cluster the feature view by.
                 - Defaults to the join keys from entities.
                 - If `timestamp_col` is provided, it is added to the default clustering keys.
+            online_config: Optional configuration for online storage. If provided with enable=True,
+                online storage will be enabled. Defaults to None (no online storage).
             _kwargs: reserved kwargs for system generated args. NOTE: DO NOT USE.
         Example::
@@ -227,9 +271,26 @@ class FeatureView(lineage_node.LineageNode):
             >>> registered_fv = fs.register_feature_view(draft_fv, "v1")
             >>> print(registered_fv.status)
             FeatureViewStatus.ACTIVE
+            <BLANKLINE>
+            >>> # Example with online configuration for online feature storage
+            >>> config = OnlineConfig(enable=True, target_lag='15s')
+            >>> online_fv = FeatureView(
+            ...     name="my_online_fv",
+            ...     entities=[e1, e2],
+            ...     feature_df=feature_df,
+            ...     timestamp_col='TS',
+            ...     refresh_freq='1d',
+            ...     desc='Feature view with online storage',
+            ...     online_config=config  # optional, enables online feature storage
+            ... )
+            >>> registered_online_fv = fs.register_feature_view(online_fv, "v1")
+            >>> print(registered_online_fv.online)
+            True
         # noqa: DAR401
         """
+        if online_config is not None:
+            logging.warning("'online_config' is in private preview since 1.12.0. Do not use it in production.")
         self._name: SqlIdentifier = SqlIdentifier(name)
         self._entities: list[Entity] = entities
@@ -257,6 +318,7 @@ class FeatureView(lineage_node.LineageNode):
         self._cluster_by: list[SqlIdentifier] = (
             [SqlIdentifier(col) for col in cluster_by] if cluster_by is not None else self._get_default_cluster_by()
         )
+        self._online_config: Optional[OnlineConfig] = online_config
         # Validate kwargs
         if _kwargs:
@@ -470,6 +532,31 @@ class FeatureView(lineage_node.LineageNode):
     def feature_descs(self) -> Optional[dict[SqlIdentifier, str]]:
         return self._feature_desc
+    @property
+    def online(self) -> bool:
+        return self._online_config.enable if self._online_config else False
+    @property
+    def online_config(self) -> Optional[OnlineConfig]:
+        return self._online_config
+    def fully_qualified_online_table_name(self) -> str:
+        """Get the fully qualified name for the online feature table.
+        Returns:
+            The fully qualified name (<database_name>.<schema_name>.<online_table_name>) for the
+            online feature table in Snowflake.
+        Raises:
+            RuntimeError: if the FeatureView is not registered or not configured for online storage.
+        """
+        if self.status == FeatureViewStatus.DRAFT or self.version is None:
+            raise RuntimeError(f"FeatureView {self.name} has not been registered.")
+        if not self.online:
+            raise RuntimeError(f"FeatureView {self.name} is not configured for online storage.")
+        online_table_name = self._get_online_table_name(self.name, self.version)
+        return f"{self._database}.{self._schema}.{online_table_name}"
     def list_columns(self) -> DataFrame:
         """List all columns and their information.
@@ -756,6 +843,8 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
                 feature_desc_dict[k.identifier()] = v
             fv_dict["_feature_desc"] = feature_desc_dict
+        fv_dict["_online_config"] = self._online_config.to_json() if self._online_config is not None else None
         lineage_node_keys = [key for key in fv_dict if key.startswith("_node") or key == "_session"]
         for key in lineage_node_keys:
@@ -844,6 +933,9 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
             owner=json_dict["_owner"],
             infer_schema_df=session.sql(json_dict.get("_infer_schema_query", None)),
             session=session,
+            online_config=OnlineConfig.from_json(json_dict["_online_config"])
+            if json_dict.get("_online_config")
+            else None,
         )
     def _get_compact_repr(self) -> _CompactRepresentation:
@@ -916,6 +1008,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         infer_schema_df: Optional[DataFrame],
         session: Session,
         cluster_by: Optional[list[str]] = None,
+        online_config: Optional[OnlineConfig] = None,
     ) -> FeatureView:
         fv = FeatureView(
             name=name,
@@ -925,6 +1018,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
             desc=desc,
             _infer_schema_df=infer_schema_df,
             cluster_by=cluster_by,
+            online_config=online_config,
         )
         fv._version = FeatureViewVersion(version) if version is not None else None
         fv._status = status
@@ -961,5 +1055,33 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         return default_cluster_by_cols
+    @staticmethod
+    def _get_online_table_name(
+        feature_view_name: Union[SqlIdentifier, str], version: Optional[Union[FeatureViewVersion, str]] = None
+    ) -> SqlIdentifier:
+        """Get the online feature table name without qualification.
+        Args:
+            feature_view_name: Offline feature view name.
+            version: Feature view version. If not provided, feature_view_name must be a SqlIdentifier.
+        Returns:
+            The online table name SqlIdentifier
+        """
+        if version is None:
+            assert isinstance(feature_view_name, SqlIdentifier), "Single argument must be SqlIdentifier"
+            online_name = f"{feature_view_name.resolved()}{_ONLINE_TABLE_SUFFIX}"
+            return SqlIdentifier(online_name, case_sensitive=True)
+        else:
+            fv_name = (
+                feature_view_name
+                if isinstance(feature_view_name, SqlIdentifier)
+                else SqlIdentifier(feature_view_name, case_sensitive=True)
+            )
+            fv_version = version if isinstance(version, FeatureViewVersion) else FeatureViewVersion(version)
+            physical_name = FeatureView._get_physical_name(fv_name, fv_version).resolved()
+            online_name = f"{physical_name}{_ONLINE_TABLE_SUFFIX}"
+            return SqlIdentifier(online_name, case_sensitive=True)
 lineage_node.DOMAIN_LINEAGE_REGISTRY["feature_view"] = FeatureView

snowflake/ml/jobs/_utils/constants.py CHANGED Viewed

@@ -3,26 +3,23 @@ from snowflake.ml.jobs._utils.types import ComputeResources
 # SPCS specification constants
 DEFAULT_CONTAINER_NAME = "main"
+MEMORY_VOLUME_NAME = "dshm"
+STAGE_VOLUME_NAME = "stage-volume"
+# Environment variables
+STAGE_MOUNT_PATH_ENV_VAR = "MLRS_STAGE_MOUNT_PATH"
 PAYLOAD_DIR_ENV_VAR = "MLRS_PAYLOAD_DIR"
 RESULT_PATH_ENV_VAR = "MLRS_RESULT_PATH"
 MIN_INSTANCES_ENV_VAR = "MLRS_MIN_INSTANCES"
 TARGET_INSTANCES_ENV_VAR = "SNOWFLAKE_JOBS_COUNT"
 RUNTIME_IMAGE_TAG_ENV_VAR = "MLRS_CONTAINER_IMAGE_TAG"
-MEMORY_VOLUME_NAME = "dshm"
-STAGE_VOLUME_NAME = "stage-volume"
-# Base mount path
-STAGE_VOLUME_MOUNT_PATH = "/mnt/job_stage"
-# Stage subdirectory paths
+# Stage mount paths
+STAGE_VOLUME_MOUNT_PATH = "/mnt/job_stage"
 APP_STAGE_SUBPATH = "app"
 SYSTEM_STAGE_SUBPATH = "system"
 OUTPUT_STAGE_SUBPATH = "output"
-# Complete mount paths (automatically generated from base + subpath)
-APP_MOUNT_PATH = f"{STAGE_VOLUME_MOUNT_PATH}/{APP_STAGE_SUBPATH}"
-SYSTEM_MOUNT_PATH = f"{STAGE_VOLUME_MOUNT_PATH}/{SYSTEM_STAGE_SUBPATH}"
-OUTPUT_MOUNT_PATH = f"{STAGE_VOLUME_MOUNT_PATH}/{OUTPUT_STAGE_SUBPATH}"
+RESULT_PATH_DEFAULT_VALUE = f"{OUTPUT_STAGE_SUBPATH}/mljob_result.pkl"
 # Default container image information
 DEFAULT_IMAGE_REPO = "/snowflake/images/snowflake_images"
@@ -59,8 +56,6 @@ ENABLE_HEALTH_CHECKS = "false"
 JOB_POLL_INITIAL_DELAY_SECONDS = 0.1
 JOB_POLL_MAX_DELAY_SECONDS = 30
-RESULT_PATH_DEFAULT_VALUE = f"{OUTPUT_MOUNT_PATH}/mljob_result.pkl"
 # Log start and end messages
 LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
 LOG_END_MSG = "--------------------------------\nML job finished\n--------------------------------"
@@ -98,6 +93,3 @@ CLOUD_INSTANCE_FAMILIES = {
     SnowflakeCloudType.AWS: AWS_INSTANCE_FAMILIES,
     SnowflakeCloudType.AZURE: AZURE_INSTANCE_FAMILIES,
 }
-# runtime version environment variable
-ENABLE_IMAGE_VERSION_ENV_VAR = "MLRS_ENABLE_RUNTIME_VERSIONS"

snowflake/ml/jobs/_utils/feature_flags.py ADDED Viewed

@@ -0,0 +1,16 @@
+import os
+from enum import Enum
+class FeatureFlags(Enum):
+    USE_SUBMIT_JOB_V2 = "MLRS_USE_SUBMIT_JOB_V2"
+    ENABLE_IMAGE_VERSION_ENV_VAR = "MLRS_ENABLE_RUNTIME_VERSIONS"
+    def is_enabled(self) -> bool:
+        return os.getenv(self.value, "false").lower() == "true"
+    def is_disabled(self) -> bool:
+        return not self.is_enabled()
+    def __str__(self) -> str:
+        return self.value

snowflake/ml/jobs/_utils/payload_utils.py CHANGED Viewed

@@ -60,7 +60,7 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
     # Change directory to user payload directory
     if [ -n "${constants.PAYLOAD_DIR_ENV_VAR}" ]; then
-        cd ${constants.PAYLOAD_DIR_ENV_VAR}
+        cd ${constants.STAGE_MOUNT_PATH_ENV_VAR}/${constants.PAYLOAD_DIR_ENV_VAR}
     fi
     ##### Set up Python environment #####
@@ -69,7 +69,10 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
     if [ -f "${{MLRS_SYSTEM_REQUIREMENTS_FILE}}" ]; then
         echo "Installing packages from $MLRS_SYSTEM_REQUIREMENTS_FILE"
-        pip install -r $MLRS_SYSTEM_REQUIREMENTS_FILE
+        if ! pip install --no-index -r $MLRS_SYSTEM_REQUIREMENTS_FILE; then
+            echo "Offline install failed, falling back to regular pip install"
+            pip install -r $MLRS_SYSTEM_REQUIREMENTS_FILE
+        fi
     fi
     MLRS_REQUIREMENTS_FILE=${{MLRS_REQUIREMENTS_FILE:-"requirements.txt"}}
@@ -535,19 +538,30 @@ class JobPayload:
         upload_system_resources(session, system_stage_path)
         python_entrypoint: list[Union[str, PurePath]] = [
-            PurePath(f"{constants.SYSTEM_MOUNT_PATH}/mljob_launcher.py"),
-            PurePath(f"{constants.APP_MOUNT_PATH}/{entrypoint.file_path.relative_to(source).as_posix()}"),
+            PurePath(constants.STAGE_VOLUME_MOUNT_PATH, constants.SYSTEM_STAGE_SUBPATH, "mljob_launcher.py"),
+            PurePath(
+                constants.STAGE_VOLUME_MOUNT_PATH,
+                constants.APP_STAGE_SUBPATH,
+                entrypoint.file_path.relative_to(source).as_posix(),
+            ),
         ]
         if entrypoint.main_func:
             python_entrypoint += ["--script_main_func", entrypoint.main_func]
+        env_vars = {
+            constants.STAGE_MOUNT_PATH_ENV_VAR: constants.STAGE_VOLUME_MOUNT_PATH,
+            constants.PAYLOAD_DIR_ENV_VAR: constants.APP_STAGE_SUBPATH,
+            constants.RESULT_PATH_ENV_VAR: constants.RESULT_PATH_DEFAULT_VALUE,
+        }
         return types.UploadedPayload(
             stage_path=stage_path,
             entrypoint=[
                 "bash",
-                f"{constants.SYSTEM_MOUNT_PATH}/{_STARTUP_SCRIPT_PATH}",
+                f"{constants.STAGE_VOLUME_MOUNT_PATH}/{constants.SYSTEM_STAGE_SUBPATH}/{_STARTUP_SCRIPT_PATH}",
                 *python_entrypoint,
             ],
+            env_vars=env_vars,
         )

snowflake/ml/jobs/_utils/scripts/get_instance_ip.py CHANGED Viewed

@@ -41,18 +41,29 @@ def get_first_instance(service_name: str) -> Optional[tuple[str, str, str]]:
     from snowflake.runtime.utils import session_utils
     session = session_utils.get_session()
-    df = session.sql(f"show service instances in service {service_name}")
-    result = df.select('"instance_id"', '"ip_address"', '"start_time"', '"status"').collect()
+    result = session.sql(f"show service instances in service {service_name}").collect()
     if not result:
         return None
-    # Sort by start_time first, then by instance_id. If start_time is null/empty, it will be sorted to the end.
-    sorted_instances = sorted(result, key=lambda x: (not bool(x["start_time"]), x["start_time"], int(x["instance_id"])))
-    head_instance = sorted_instances[0]
+    # we have already integrated with first_instance startup policy,
+    # the instance 0 is guaranteed to be the head instance
+    head_instance = next(
+        (
+            row
+            for row in result
+            if "instance_id" in row and row["instance_id"] is not None and int(row["instance_id"]) == 0
+        ),
+        None,
+    )
+    # fallback to find the first instance if the instance 0 is not found
+    if not head_instance:
+        # Sort by start_time first, then by instance_id. If start_time is null/empty, it will be sorted to the end.
+        sorted_instances = sorted(
+            result, key=lambda x: (not bool(x["start_time"]), x["start_time"], int(x["instance_id"]))
+        )
+        head_instance = sorted_instances[0]
     if not head_instance["instance_id"] or not head_instance["ip_address"]:
         return None
     # Validate head instance IP
     ip_address = head_instance["ip_address"]
     try:

snowflake/ml/jobs/_utils/scripts/mljob_launcher.py CHANGED Viewed

@@ -48,8 +48,8 @@ MIN_INSTANCES_ENV_VAR = getattr(constants, "MIN_INSTANCES_ENV_VAR", "MLRS_MIN_IN
 TARGET_INSTANCES_ENV_VAR = getattr(constants, "TARGET_INSTANCES_ENV_VAR", "SNOWFLAKE_JOBS_COUNT")
 # Fallbacks in case of SnowML version mismatch
+STAGE_MOUNT_PATH_ENV_VAR = getattr(constants, "STAGE_MOUNT_PATH_ENV_VAR", "MLRS_STAGE_MOUNT_PATH")
 RESULT_PATH_ENV_VAR = getattr(constants, "RESULT_PATH_ENV_VAR", "MLRS_RESULT_PATH")
-JOB_RESULT_PATH = os.environ.get(RESULT_PATH_ENV_VAR, "/mnt/job_stage/output/mljob_result.pkl")
 PAYLOAD_DIR_ENV_VAR = getattr(constants, "PAYLOAD_DIR_ENV_VAR", "MLRS_PAYLOAD_DIR")
 # Constants for the wait_for_instances function
@@ -57,6 +57,9 @@ MIN_WAIT_TIME = float(os.getenv("MLRS_INSTANCES_MIN_WAIT") or -1)  # seconds
 TIMEOUT = float(os.getenv("MLRS_INSTANCES_TIMEOUT") or 720)  # seconds
 CHECK_INTERVAL = float(os.getenv("MLRS_INSTANCES_CHECK_INTERVAL") or 10)  # seconds
+STAGE_MOUNT_PATH = os.environ.get(STAGE_MOUNT_PATH_ENV_VAR, "/mnt/job_stage")
+JOB_RESULT_PATH = os.environ.get(RESULT_PATH_ENV_VAR, "output/mljob_result.pkl")
 try:
     from snowflake.ml.jobs._utils.interop_utils import ExecutionResult
@@ -226,12 +229,16 @@ def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = N
     # This is needed because mljob_launcher.py is now in /mnt/job_stage/system
     # but user scripts are in the payload directory and may import from each other
     payload_dir = os.environ.get(PAYLOAD_DIR_ENV_VAR)
+    if payload_dir and not os.path.isabs(payload_dir):
+        payload_dir = os.path.join(STAGE_MOUNT_PATH, payload_dir)
     if payload_dir and payload_dir not in sys.path:
         sys.path.insert(0, payload_dir)
     # Create a Snowpark session before running the script
     # Session can be retrieved from using snowflake.snowpark.context.get_active_session()
-    session = Session.builder.configs(SnowflakeLoginOptions()).create()  # noqa: F841
+    config = SnowflakeLoginOptions()
+    config["client_session_keep_alive"] = "True"
+    session = Session.builder.configs(config).create()  # noqa: F841
     try:
@@ -259,6 +266,7 @@ def run_script(script_path: str, *script_args: Any, main_func: Optional[str] = N
     finally:
         # Restore original sys.argv
         sys.argv = original_argv
+        session.close()
 def main(script_path: str, *script_args: Any, script_main_func: Optional[str] = None) -> ExecutionResult:
@@ -276,9 +284,19 @@ def main(script_path: str, *script_args: Any, script_main_func: Optional[str] =
         Exception: Re-raises any exception caught during script execution.
     """
     # Ensure the output directory exists before trying to write result files.
-    output_dir = os.path.dirname(JOB_RESULT_PATH)
+    result_abs_path = (
+        JOB_RESULT_PATH if os.path.isabs(JOB_RESULT_PATH) else os.path.join(STAGE_MOUNT_PATH, JOB_RESULT_PATH)
+    )
+    output_dir = os.path.dirname(result_abs_path)
     os.makedirs(output_dir, exist_ok=True)
+    try:
+        import ray
+        ray.init(address="auto")
+    except ModuleNotFoundError:
+        warnings.warn("Ray is not installed, skipping Ray initialization", ImportWarning, stacklevel=1)
     try:
         # Wait for minimum required instances if specified
         min_instances_str = os.environ.get(MIN_INSTANCES_ENV_VAR) or "1"
@@ -317,7 +335,7 @@ def main(script_path: str, *script_args: Any, script_main_func: Optional[str] =
         result_dict = result_obj.to_dict()
         try:
             # Serialize result using cloudpickle
-            result_pickle_path = JOB_RESULT_PATH
+            result_pickle_path = result_abs_path
             with open(result_pickle_path, "wb") as f:
                 cloudpickle.dump(result_dict, f)  # Pickle dictionary form for compatibility
         except Exception as pkl_exc:
@@ -326,7 +344,7 @@ def main(script_path: str, *script_args: Any, script_main_func: Optional[str] =
         try:
             # Serialize result to JSON as fallback path in case of cross version incompatibility
             # TODO: Manually convert non-serializable types to strings
-            result_json_path = os.path.splitext(JOB_RESULT_PATH)[0] + ".json"
+            result_json_path = os.path.splitext(result_abs_path)[0] + ".json"
             with open(result_json_path, "w") as f:
                 json.dump(result_dict, f, indent=2, cls=SimpleJSONEncoder)
         except Exception as json_exc:

snowflake/ml/jobs/_utils/spec_utils.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Any, Literal, Optional, Union
 from snowflake import snowpark
 from snowflake.ml._internal.utils import snowflake_env
-from snowflake.ml.jobs._utils import constants, query_helper, types
+from snowflake.ml.jobs._utils import constants, feature_flags, query_helper, types
 from snowflake.ml.jobs._utils.runtime_env_utils import RuntimeEnvironmentsDict
@@ -63,7 +63,7 @@ def _get_image_spec(session: snowpark.Session, compute_pool: str) -> types.Image
     # Use MLRuntime image
     hardware = "GPU" if resources.gpu > 0 else "CPU"
     container_image = None
-    if os.environ.get(constants.ENABLE_IMAGE_VERSION_ENV_VAR, "").lower() == "true":
+    if feature_flags.FeatureFlags.ENABLE_IMAGE_VERSION_ENV_VAR.is_enabled():
         container_image = _get_runtime_image(session, hardware)  # type: ignore[arg-type]
     if not container_image:
@@ -98,6 +98,7 @@ def generate_spec_overrides(
     container_spec: dict[str, Any] = {
         "name": constants.DEFAULT_CONTAINER_NAME,
     }
     if environment_vars:
         # TODO: Validate environment variables
         container_spec["env"] = environment_vars
@@ -213,10 +214,7 @@ def generate_service_spec(
     # TODO: Add hooks for endpoints for integration with TensorBoard etc
-    env_vars = {
-        constants.PAYLOAD_DIR_ENV_VAR: constants.APP_MOUNT_PATH,
-        constants.RESULT_PATH_ENV_VAR: constants.RESULT_PATH_DEFAULT_VALUE,
-    }
+    env_vars = payload.env_vars
     endpoints: list[dict[str, Any]] = []
     if target_instances > 1:

snowflake/ml/jobs/_utils/types.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import PurePath
 from typing import Iterator, Literal, Optional, Protocol, Union, runtime_checkable
@@ -90,6 +90,7 @@ class UploadedPayload:
     # TODO: Include manifest of payload files for validation
     stage_path: PurePath
     entrypoint: list[Union[str, PurePath]]
+    env_vars: dict[str, str] = field(default_factory=dict)
 @dataclass(frozen=True)

snowflake/ml/jobs/job.py CHANGED Viewed

@@ -50,7 +50,7 @@ class MLJob(Generic[T], SerializableSessionMixin):
     def min_instances(self) -> int:
         try:
             return int(self._container_spec["env"].get(constants.MIN_INSTANCES_ENV_VAR, 1))
-        except TypeError:
+        except (TypeError, ValueError):
             return 1
     @property
@@ -83,7 +83,10 @@ class MLJob(Generic[T], SerializableSessionMixin):
     def _container_spec(self) -> dict[str, Any]:
         """Get the job's main container spec."""
         containers = self._service_spec["spec"]["containers"]
-        container_spec = next(c for c in containers if c["name"] == constants.DEFAULT_CONTAINER_NAME)
+        try:
+            container_spec = next(c for c in containers if c["name"] == constants.DEFAULT_CONTAINER_NAME)
+        except StopIteration:
+            raise ValueError(f"Container '{constants.DEFAULT_CONTAINER_NAME}' not found in job {self.name}")
         return cast(dict[str, Any], container_spec)
     @property
@@ -99,21 +102,23 @@ class MLJob(Generic[T], SerializableSessionMixin):
         result_path_str = self._container_spec["env"].get(constants.RESULT_PATH_ENV_VAR)
         if result_path_str is None:
             raise RuntimeError(f"Job {self.name} doesn't have a result path configured")
-        volume_mounts = self._container_spec["volumeMounts"]
-        stage_mount_str = next(v for v in volume_mounts if v.get("name") == constants.STAGE_VOLUME_NAME)["mountPath"]
+        # If result path is relative, it is relative to the stage mount path
         result_path = Path(result_path_str)
+        if not result_path.is_absolute():
+            return f"{self._stage_path}/{result_path.as_posix()}"
+        # If result path is absolute, it is relative to the stage mount path
+        volume_mounts = self._container_spec["volumeMounts"]
+        stage_mount_str = next(v for v in volume_mounts if v.get("name") == constants.STAGE_VOLUME_NAME)["mountPath"]
         stage_mount = Path(stage_mount_str)
         try:
             relative_path = result_path.relative_to(stage_mount)
+            return f"{self._stage_path}/{relative_path.as_posix()}"
         except ValueError:
-            if result_path.is_absolute():
-                raise ValueError(
-                    f"Result path {result_path} is absolute, but should be relative to stage mount {stage_mount}"
-                )
-            relative_path = result_path
-        return f"{self._stage_path}/{relative_path.as_posix()}"
+            raise ValueError(
+                f"Result path {result_path} is absolute, but should be relative to stage mount {stage_mount}"
+            )
     @overload
     def get_logs(
@@ -419,15 +424,29 @@ def _get_head_instance_id(session: snowpark.Session, job_id: str) -> Optional[in
     if not rows:
         return None
-    if target_instances > len(rows):
-        raise RuntimeError("Couldn’t retrieve head instance due to missing instances.")
+    # we have already integrated with first_instance startup policy,
+    # the instance 0 is guaranteed to be the head instance
+    head_instance = next(
+        (
+            row
+            for row in rows
+            if "instance_id" in row and row["instance_id"] is not None and int(row["instance_id"]) == 0
+        ),
+        None,
+    )
+    # fallback to find the first instance if the instance 0 is not found
+    if not head_instance:
+        if target_instances > len(rows):
+            raise RuntimeError(
+                f"Couldn’t retrieve head instance due to missing instances. {target_instances} > {len(rows)}"
+            )
+        # Sort by start_time first, then by instance_id
+        try:
+            sorted_instances = sorted(rows, key=lambda x: (x["start_time"], int(x["instance_id"])))
+        except TypeError:
+            raise RuntimeError("Job instance information unavailable.")
+        head_instance = sorted_instances[0]
-    # Sort by start_time first, then by instance_id
-    try:
-        sorted_instances = sorted(rows, key=lambda x: (x["start_time"], int(x["instance_id"])))
-    except TypeError:
-        raise RuntimeError("Job instance information unavailable.")
-    head_instance = sorted_instances[0]
     if not head_instance["start_time"]:
         # If head instance hasn't started yet, return None
         return None

snowflake-ml-python 1.11.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

snowflake-ml-python 1.11.0__py3-none-any.whl → 1.13.0__py3-none-any.whl