snowflake-ml-python 1.7.4__py3-none-any.whl → 1.7.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear there.
- snowflake/ml/_internal/env_utils.py +64 -21
- snowflake/ml/_internal/relax_version_strategy.py +16 -0
- snowflake/ml/_internal/telemetry.py +21 -0
- snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
- snowflake/ml/feature_store/feature_store.py +18 -0
- snowflake/ml/feature_store/feature_view.py +46 -1
- snowflake/ml/jobs/_utils/constants.py +7 -1
- snowflake/ml/jobs/_utils/payload_utils.py +139 -53
- snowflake/ml/jobs/_utils/spec_utils.py +5 -7
- snowflake/ml/jobs/decorators.py +5 -25
- snowflake/ml/jobs/job.py +4 -4
- snowflake/ml/model/_packager/model_env/model_env.py +45 -28
- snowflake/ml/model/_packager/model_handlers/_utils.py +8 -4
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +16 -0
- snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
- snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
- snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
- snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
- snowflake/ml/model/_signatures/core.py +2 -2
- snowflake/ml/model/_signatures/numpy_handler.py +5 -5
- snowflake/ml/model/_signatures/pandas_handler.py +9 -7
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/model_signature.py +8 -0
- snowflake/ml/model/type_hints.py +15 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
- snowflake/ml/modeling/pipeline/pipeline.py +18 -1
- snowflake/ml/modeling/preprocessing/polynomial_features.py +2 -2
- snowflake/ml/registry/registry.py +34 -4
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +58 -25
- {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +41 -38
- {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
- {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
snowflake/ml/_internal/env_utils.py

```diff
@@ -12,7 +12,7 @@ import yaml
 from packaging import requirements, specifiers, version
 
 import snowflake.connector
-from snowflake.ml._internal import env as snowml_env
+from snowflake.ml._internal import env as snowml_env, relax_version_strategy
 from snowflake.ml._internal.utils import query_result_checker
 from snowflake.snowpark import context, exceptions, session
 
```
```diff
@@ -56,6 +56,8 @@ def _validate_pip_requirement_string(req_str: str) -> requirements.Requirement:
 
         if r.name == "python":
             raise ValueError("Don't specify python as a dependency, use python version argument instead.")
+        if r.name == "cuda":
+            raise ValueError("Don't specify cuda as a dependency, use cuda version argument instead.")
     except requirements.InvalidRequirement:
         raise ValueError(f"Invalid package requirement {req_str} found.")
 
```
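The new `cuda` guard mirrors the existing `python` guard. A quick sanity check of the behavior (a sketch; `_validate_pip_requirement_string` is an internal helper, so its import path is not a stable API):

```python
from snowflake.ml._internal import env_utils

try:
    env_utils._validate_pip_requirement_string("cuda==12.1")
except ValueError as e:
    print(e)  # Don't specify cuda as a dependency, use cuda version argument instead.
```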
```diff
@@ -313,19 +315,14 @@ def get_package_spec_with_supported_ops_only(req: requirements.Requirement) -> requirements.Requirement:
     return new_req
 
 
-def relax_requirement_version(req: requirements.Requirement) -> requirements.Requirement:
-    """…
-    Returns:
-        A new requirement object after relaxations.
-    """
-    new_req = copy.deepcopy(req)
+def _relax_specifier_set(
+    specifier_set: specifiers.SpecifierSet, strategy: relax_version_strategy.RelaxVersionStrategy
+) -> specifiers.SpecifierSet:
+    if strategy == relax_version_strategy.RelaxVersionStrategy.NO_RELAX:
+        return specifier_set
+    specifier_set = copy.deepcopy(specifier_set)
     relaxed_specifier_set = set()
-    for spec in new_req.specifier._specs:
+    for spec in specifier_set._specs:
         if spec.operator != "==":
             relaxed_specifier_set.add(spec)
             continue
```
```diff
@@ -337,9 +334,40 @@ def relax_requirement_version(req: requirements.Requirement) -> requirements.Requirement:
             relaxed_specifier_set.add(spec)
             continue
         assert pinned_version is not None
-        …
+        if strategy == relax_version_strategy.RelaxVersionStrategy.PATCH:
+            relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}.{pinned_version.minor}"))
+            relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major}.{pinned_version.minor+1}"))
+        elif strategy == relax_version_strategy.RelaxVersionStrategy.MINOR:
+            relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}.{pinned_version.minor}"))
+            relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major + 1}"))
+        elif strategy == relax_version_strategy.RelaxVersionStrategy.MAJOR:
+            relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}"))
+            relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major + 1}"))
+    specifier_set._specs = frozenset(relaxed_specifier_set)
+    return specifier_set
+
+
+def relax_requirement_version(req: requirements.Requirement) -> requirements.Requirement:
+    """Relax version specifier from a requirement. It detects any ==x.y.z in specifiers and replaced with relaxed
+    version specifier based on the strategy defined in RELAX_VERSION_STRATEGY_MAP.
+
+    NO_RELAX: No relaxation.
+    PATCH: >=x.y, <x.(y+1)
+    MINOR (default): >=x.y, <(x+1)
+    MAJOR: >=x, <(x+1)
+
+    Args:
+        req: The requirement that version specifier to be removed.
+
+    Returns:
+        A new requirement object after relaxations.
+    """
+    new_req = copy.deepcopy(req)
+    strategy = relax_version_strategy.RELAX_VERSION_STRATEGY_MAP.get(
+        req.name, relax_version_strategy.RelaxVersionStrategy.MINOR
+    )
+    new_req.specifier = _relax_specifier_set(new_req.specifier, strategy)
     return new_req
 
```
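To see what the strategies do end to end, here is a hedged sketch of `relax_requirement_version` applied to pinned requirements, assuming the mapping from the new `relax_version_strategy` module shown further below (specifier ordering in the printed output may differ):

```python
from packaging import requirements

from snowflake.ml._internal import env_utils

# cloudpickle -> NO_RELAX: the exact pin is preserved.
print(env_utils.relax_requirement_version(requirements.Requirement("cloudpickle==2.2.1")))
# cloudpickle==2.2.1

# scikit-learn -> PATCH: ==1.5.1 becomes >=1.5, <1.6.
print(env_utils.relax_requirement_version(requirements.Requirement("scikit-learn==1.5.1")))
# scikit-learn<1.6,>=1.5

# Unmapped packages -> MINOR (the default): ==1.26.4 becomes >=1.26, <2.
print(env_utils.relax_requirement_version(requirements.Requirement("numpy==1.26.4")))
# numpy<2,>=1.26
```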
```diff
@@ -431,10 +459,11 @@ def save_conda_env_file(
     path: pathlib.Path,
     conda_chan_deps: DefaultDict[str, List[requirements.Requirement]],
     python_version: str,
+    cuda_version: Optional[str] = None,
     default_channel_override: str = SNOWFLAKE_CONDA_CHANNEL_URL,
 ) -> None:
     """Generate conda.yml file given a dict of dependencies after validation.
-    The channels part of conda.yml file will …
+    The channels part of conda.yml file will contain Snowflake Anaconda Channel, nodefaults and all channel names
     in keys of the dict, ordered by the number of the packages which belongs to.
     The dependencies part of conda.yml file will contains requirements specifications. If the requirements is in the
     value list whose key is DEFAULT_CHANNEL_NAME, then the channel won't be specified explicitly. Otherwise, it will be
```
```diff
@@ -443,7 +472,8 @@ def save_conda_env_file(
     Args:
         path: Path to the conda.yml file.
         conda_chan_deps: Dict of conda dependencies after validated.
-        python_version: A string 'major.minor'
+        python_version: A string 'major.minor' for the model's python version.
+        cuda_version: A string 'major.minor' for the model's cuda version.
         default_channel_override: The default channel to be put in the first place of the channels section.
     """
     assert path.suffix in [".yml", ".yaml"], "Conda environment file should have extension of yml or yaml."
```
```diff
@@ -461,6 +491,10 @@ def save_conda_env_file(
 
     env["channels"] = [default_channel_override] + channels + [_NODEFAULTS]
     env["dependencies"] = [f"python=={python_version}.*"]
+
+    if cuda_version is not None:
+        env["dependencies"].extend([f"nvidia::cuda=={cuda_version}.*"])
+
     for chan, reqs in conda_chan_deps.items():
         env["dependencies"].extend(
             [f"{chan}::{str(req)}" if chan != DEFAULT_CHANNEL_NAME else str(req) for req in reqs]
```
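For illustration, with `python_version="3.10"` and `cuda_version="12.1"` the `env` mapping assembled above would be shaped roughly like this before being written out as YAML (a sketch; the channel URL and package entries are illustrative):

```python
env = {
    "channels": ["https://repo.anaconda.com/pkgs/snowflake", "conda-forge", "nodefaults"],
    "dependencies": [
        "python==3.10.*",
        "nvidia::cuda==12.1.*",          # added only when cuda_version is not None
        "scikit-learn==1.5.1",           # DEFAULT_CHANNEL_NAME entries stay unqualified
        "conda-forge::some-pkg==1.2.3",  # other channels get a "channel::" prefix
    ],
}
```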
```diff
@@ -487,7 +521,12 @@ def save_requirements_file(path: pathlib.Path, pip_deps: List[requirements.Requirement]) -> None:
 
 def load_conda_env_file(
     path: pathlib.Path,
-) -> Tuple[…
+) -> Tuple[
+    DefaultDict[str, List[requirements.Requirement]],
+    Optional[List[requirements.Requirement]],
+    Optional[str],
+    Optional[str],
+]:
     """Read conda.yml file to get a dict of dependencies after validation.
     The channels part of conda.yml file will be processed with following rules:
     1. If it is Snowflake Anaconda Channel, ignore as it is default.
```
```diff
@@ -515,7 +554,7 @@ def load_conda_env_file(
     and a string 'major.minor.patchlevel' of python version.
     """
     if not path.exists():
-        return collections.defaultdict(list), None, None
+        return collections.defaultdict(list), None, None, None
 
     with open(path, encoding="utf-8") as f:
         env = yaml.safe_load(stream=f)
```
```diff
@@ -526,6 +565,7 @@ def load_conda_env_file(
     pip_deps = []
 
     python_version = None
+    cuda_version = None
 
     channels = env.get("channels", [])
     if len(channels) >= 1:
```
```diff
@@ -541,6 +581,9 @@ def load_conda_env_file(
                 # ver is str: python w/ specifier
                 if ver:
                     python_version = ver
+            elif dep.startswith("nvidia::cuda"):
+                r = requirements.Requirement(dep.split("nvidia::")[1])
+                cuda_version = list(r.specifier)[0].version.strip(".*")
             elif ver is None:
                 deps.append(dep)
         elif isinstance(dep, dict) and "pip" in dep:
```
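The cuda parsing can be exercised standalone. Note that `str.strip(".*")` strips any leading or trailing `.` and `*` characters (not the literal suffix `.*`), which is harmless for well-formed version strings:

```python
from packaging import requirements

dep = "nvidia::cuda==12.1.*"
r = requirements.Requirement(dep.split("nvidia::")[1])
print(list(r.specifier)[0].version.strip(".*"))  # 12.1
```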
```diff
@@ -555,7 +598,7 @@ def load_conda_env_file(
         if channel not in conda_dep_dict:
             conda_dep_dict[channel] = []
 
-    return conda_dep_dict, pip_deps_list if pip_deps_list else None, python_version
+    return conda_dep_dict, pip_deps_list if pip_deps_list else None, python_version, cuda_version
 
 
 def load_requirements_file(path: pathlib.Path) -> List[requirements.Requirement]:
```
snowflake/ml/_internal/relax_version_strategy.py (new file)

```diff
@@ -0,0 +1,16 @@
+from enum import Enum
+
+
+class RelaxVersionStrategy(Enum):
+    NO_RELAX = "no_relax"
+    PATCH = "patch"
+    MINOR = "minor"
+    MAJOR = "major"
+
+
+RELAX_VERSION_STRATEGY_MAP = {
+    # The version of cloudpickle should not be relaxed as it is used for serialization.
+    "cloudpickle": RelaxVersionStrategy.NO_RELAX,
+    # The version of scikit-learn should be relaxed only in patch version as it has breaking changes in minor version.
+    "scikit-learn": RelaxVersionStrategy.PATCH,
+}
```
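Strategies are looked up by distribution name, and anything absent from the map falls back to `MINOR`, matching the default documented in `relax_requirement_version` above (a small sketch):

```python
from snowflake.ml._internal import relax_version_strategy as rvs

strategy = rvs.RELAX_VERSION_STRATEGY_MAP.get("torch", rvs.RelaxVersionStrategy.MINOR)
print(strategy)  # RelaxVersionStrategy.MINOR
```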
snowflake/ml/_internal/telemetry.py

```diff
@@ -4,6 +4,9 @@ import enum
 import functools
 import inspect
 import operator
+import sys
+import time
+import traceback
 import types
 from typing import (
     Any,
```
```diff
@@ -75,6 +78,8 @@ class TelemetryField(enum.Enum):
     KEY_FUNC_PARAMS = "func_params"
     KEY_ERROR_INFO = "error_info"
     KEY_ERROR_CODE = "error_code"
+    KEY_STACK_TRACE = "stack_trace"
+    KEY_DURATION = "duration"
     KEY_VERSION = "version"
     KEY_PYTHON_VERSION = "python_version"
     KEY_OS = "operating_system"
```
```diff
@@ -435,6 +440,7 @@ def send_api_usage_telemetry(
 
     # noqa: DAR402
     """
+    start_time = time.perf_counter()
 
     if subproject is not None and subproject_extractor is not None:
         raise ValueError("Specifying both subproject and subproject_extractor is not allowed")
```
```diff
@@ -555,8 +561,16 @@ def send_api_usage_telemetry(
                 )
             else:
                 me = e
+
             telemetry_args["error"] = repr(me)
             telemetry_args["error_code"] = me.error_code
+            # exclude telemetry frames
+            excluded_frames = 2
+            tb = traceback.extract_tb(sys.exc_info()[2])
+            formatted_tb = "".join(traceback.format_list(tb[excluded_frames:]))
+            formatted_exception = traceback.format_exception_only(*sys.exc_info()[:2])[0]  # error type + message
+            telemetry_args["stack_trace"] = formatted_tb + formatted_exception
+
             me.original_exception._snowflake_ml_handled = True  # type: ignore[attr-defined]
             if e is not me:
                 raise  # Directly raise non-wrapped exceptions to preserve original stacktrace
```
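The trace trimming is plain `traceback` machinery and can be reproduced in isolation; `excluded_frames = 2` drops the telemetry wrapper's own frames so the reported trace starts at user code (set to 0 in this toy example so there are frames left to show):

```python
import sys
import traceback

def format_trimmed_trace(excluded_frames: int) -> str:
    # Same recipe as the diff above: formatted frames after the cut,
    # followed by "ErrorType: message".
    tb = traceback.extract_tb(sys.exc_info()[2])
    formatted_tb = "".join(traceback.format_list(tb[excluded_frames:]))
    formatted_exception = traceback.format_exception_only(*sys.exc_info()[:2])[0]
    return formatted_tb + formatted_exception

try:
    raise ValueError("boom")
except ValueError:
    print(format_trimmed_trace(excluded_frames=0))
```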
```diff
@@ -565,6 +579,7 @@ def send_api_usage_telemetry(
             else:
                 raise me.original_exception from e
         finally:
+            telemetry_args["duration"] = time.perf_counter() - start_time  # type: ignore[assignment]
             telemetry.send_function_usage_telemetry(**telemetry_args)
             global _log_counter
             _log_counter += 1
```
```diff
@@ -718,12 +733,14 @@ class _SourceTelemetryClient:
         self,
         func_name: str,
         function_category: str,
+        duration: float,
         func_params: Optional[Dict[str, Any]] = None,
         api_calls: Optional[List[Dict[str, Any]]] = None,
         sfqids: Optional[List[Any]] = None,
         custom_tags: Optional[Dict[str, Union[bool, int, str, float]]] = None,
         error: Optional[str] = None,
         error_code: Optional[str] = None,
+        stack_trace: Optional[str] = None,
     ) -> None:
         """
         Send function usage telemetry message.
```
```diff
@@ -731,12 +748,14 @@ class _SourceTelemetryClient:
         Args:
             func_name: Function name.
             function_category: Function category.
+            duration: Function duration.
             func_params: Function parameters.
             api_calls: API calls.
             sfqids: Snowflake query IDs.
             custom_tags: Custom tags.
             error: Error.
             error_code: Error code.
+            stack_trace: Error stack trace.
         """
         data: Dict[str, Any] = {
             TelemetryField.KEY_FUNC_NAME.value: func_name,
```
```diff
@@ -755,11 +774,13 @@ class _SourceTelemetryClient:
         message: Dict[str, Any] = {
             **self._create_basic_telemetry_data(telemetry_type),
             TelemetryField.KEY_DATA.value: data,
+            TelemetryField.KEY_DURATION.value: duration,
         }
 
         if error:
             message[TelemetryField.KEY_ERROR_INFO.value] = error
             message[TelemetryField.KEY_ERROR_CODE.value] = error_code
+            message[TelemetryField.KEY_STACK_TRACE.value] = stack_trace
 
         self._send(message)
 
```
snowflake/ml/data/_internal/arrow_ingestor.py

```diff
@@ -116,7 +116,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
     def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
         ds = self._get_dataset(shuffle=False)
         table = ds.to_table() if limit is None else ds.head(num_rows=limit)
-        return table.to_pandas()
+        return table.to_pandas(split_blocks=True, self_destruct=True)
 
     def _get_dataset(self, shuffle: bool) -> pds.Dataset:
         format = self._format
```
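`split_blocks=True` keeps columns in separate pandas blocks rather than consolidating them, and `self_destruct=True` frees each Arrow buffer as it is converted; together they reduce peak memory during the Arrow-to-pandas conversion at the cost of consuming the source table. A standalone pyarrow illustration:

```python
import pyarrow as pa

table = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
df = table.to_pandas(split_blocks=True, self_destruct=True)
# `table` has been consumed by self_destruct; only use `df` from here on.
print(df)
```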
snowflake/ml/feature_store/feature_store.py

```diff
@@ -144,6 +144,7 @@ _LIST_FEATURE_VIEW_SCHEMA = StructType(
         StructField("refresh_mode", StringType()),
         StructField("scheduling_state", StringType()),
         StructField("warehouse", StringType()),
+        StructField("cluster_by", StringType()),
     ]
 )
 
```
```diff
@@ -1832,6 +1833,12 @@ class FeatureStore:
             WAREHOUSE = {warehouse}
             REFRESH_MODE = {feature_view.refresh_mode}
             INITIALIZE = {feature_view.initialize}
+            """
+        if feature_view.cluster_by:
+            cluster_by_clause = f"CLUSTER BY ({', '.join(feature_view.cluster_by)})"
+            query += f"{cluster_by_clause}"
+
+        query += f"""
             AS {feature_view.query}
             """
         self._session.sql(query).collect(block=block, statement_params=self._telemetry_stmp)
```
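With a feature view clustered on, say, `CUSTOMER_ID` and `TS`, the DDL assembled above gains a clause of this shape (column names are illustrative):

```python
cluster_by = ["CUSTOMER_ID", "TS"]  # hypothetical feature_view.cluster_by
print(f"CLUSTER BY ({', '.join(cluster_by)})")  # CLUSTER BY (CUSTOMER_ID, TS)
```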
```diff
@@ -2249,6 +2256,7 @@ class FeatureStore:
             values.append(row["refresh_mode"] if "refresh_mode" in row else None)
             values.append(row["scheduling_state"] if "scheduling_state" in row else None)
             values.append(row["warehouse"] if "warehouse" in row else None)
+            values.append(json.dumps(self._extract_cluster_by_columns(row["cluster_by"])) if "cluster_by" in row else None)
             output_values.append(values)
 
     def _lookup_feature_view_metadata(self, row: Row, fv_name: str) -> Tuple[_FeatureViewMetadata, str]:
```
```diff
@@ -2335,6 +2343,7 @@ class FeatureStore:
                 owner=row["owner"],
                 infer_schema_df=infer_schema_df,
                 session=self._session,
+                cluster_by=self._extract_cluster_by_columns(row["cluster_by"]),
             )
             return fv
         else:
```
```diff
@@ -2625,3 +2634,12 @@ class FeatureStore:
         )
 
         return feature_view
+
+    @staticmethod
+    def _extract_cluster_by_columns(cluster_by_clause: str) -> List[str]:
+        # Use regex to extract elements inside the parentheses.
+        match = re.search(r"\((.*?)\)", cluster_by_clause)
+        if match:
+            # Handle both quoted and unquoted column names.
+            return re.findall(identifier.SF_IDENTIFIER_RE, match.group(1))
+        return []
```
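`SF_IDENTIFIER_RE` is an internal pattern, so the sketch below substitutes a simplified stand-in that accepts quoted and unquoted Snowflake identifiers; the real pattern may differ:

```python
import re

SF_IDENTIFIER_RE = r'"(?:[^"]|"")+"|[A-Za-z_][A-Za-z0-9_$]*'  # stand-in, not the library's pattern

clause = 'CLUSTER BY (CUSTOMER_ID, "event ts")'
match = re.search(r"\((.*?)\)", clause)
cols = re.findall(SF_IDENTIFIER_RE, match.group(1)) if match else []
print(cols)  # ['CUSTOMER_ID', '"event ts"']
```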
snowflake/ml/feature_store/feature_view.py

```diff
@@ -170,6 +170,7 @@ class FeatureView(lineage_node.LineageNode):
         warehouse: Optional[str] = None,
         initialize: str = "ON_CREATE",
         refresh_mode: str = "AUTO",
+        cluster_by: Optional[List[str]] = None,
         **_kwargs: Any,
     ) -> None:
         """
```
```diff
@@ -200,6 +201,9 @@ class FeatureView(lineage_node.LineageNode):
             refresh_mode: The refresh mode of managed feature view. The value can be 'AUTO', 'FULL' or 'INCREMENETAL'.
                 For managed feature view, the default value is 'AUTO'. For static feature view it has no effect.
                 Check https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table for for details.
+            cluster_by: Columns to cluster the feature view by.
+                - Defaults to the join keys from entities.
+                - If `timestamp_col` is provided, it is added to the default clustering keys.
             _kwargs: reserved kwargs for system generated args. NOTE: DO NOT USE.
 
         Example::
```
```diff
@@ -224,6 +228,7 @@ class FeatureView(lineage_node.LineageNode):
         >>> print(registered_fv.status)
         FeatureViewStatus.ACTIVE
 
+        # noqa: DAR401
         """
 
         self._name: SqlIdentifier = SqlIdentifier(name)
```
```diff
@@ -233,7 +238,7 @@ class FeatureView(lineage_node.LineageNode):
             SqlIdentifier(timestamp_col) if timestamp_col is not None else None
         )
         self._desc: str = desc
-        self._infer_schema_df: DataFrame = _kwargs.…
+        self._infer_schema_df: DataFrame = _kwargs.pop("_infer_schema_df", self._feature_df)
         self._query: str = self._get_query()
         self._version: Optional[FeatureViewVersion] = None
         self._status: FeatureViewStatus = FeatureViewStatus.DRAFT
```
```diff
@@ -249,6 +254,14 @@ class FeatureView(lineage_node.LineageNode):
         self._refresh_mode: Optional[str] = refresh_mode
         self._refresh_mode_reason: Optional[str] = None
         self._owner: Optional[str] = None
+        self._cluster_by: List[SqlIdentifier] = (
+            [SqlIdentifier(col) for col in cluster_by] if cluster_by is not None else self._get_default_cluster_by()
+        )
+
+        # Validate kwargs
+        if _kwargs:
+            raise TypeError(f"FeatureView.__init__ got an unexpected keyword argument: '{next(iter(_kwargs.keys()))}'")
+
         self._validate()
 
     def slice(self, names: List[str]) -> FeatureViewSlice:
```
```diff
@@ -394,6 +407,10 @@ class FeatureView(lineage_node.LineageNode):
     def timestamp_col(self) -> Optional[SqlIdentifier]:
         return self._timestamp_col
 
+    @property
+    def cluster_by(self) -> Optional[List[SqlIdentifier]]:
+        return self._cluster_by
+
     @property
     def desc(self) -> str:
         return self._desc
```
```diff
@@ -656,6 +673,14 @@ Got {len(self._feature_df.queries['queries'])}: …
         if not isinstance(col_type, (DateType, TimeType, TimestampType, _NumericType)):
             raise ValueError(f"Invalid data type for timestamp_col {ts_col}: {col_type}.")
 
+        if self.cluster_by is not None:
+            for column in self.cluster_by:
+                if column not in df_cols:
+                    raise ValueError(
+                        f"Column '{column}' in `cluster_by` is not in the feature DataFrame schema. "
+                        f"{df_cols}, {self.cluster_by}"
+                    )
+
         if re.match(_RESULT_SCAN_QUERY_PATTERN, self._query) is not None:
             raise ValueError(f"feature_df should not be reading from RESULT_SCAN. Invalid query: {self._query}")
 
```
```diff
@@ -890,6 +915,7 @@ Got {len(self._feature_df.queries['queries'])}: …
         owner: Optional[str],
         infer_schema_df: Optional[DataFrame],
         session: Session,
+        cluster_by: Optional[List[str]] = None,
     ) -> FeatureView:
         fv = FeatureView(
             name=name,
```
```diff
@@ -898,6 +924,7 @@ Got {len(self._feature_df.queries['queries'])}: …
             timestamp_col=timestamp_col,
             desc=desc,
             _infer_schema_df=infer_schema_df,
+            cluster_by=cluster_by,
         )
         fv._version = FeatureViewVersion(version) if version is not None else None
         fv._status = status
```
```diff
@@ -916,5 +943,23 @@ Got {len(self._feature_df.queries['queries'])}: …
         )
         return fv
 
+    def _get_default_cluster_by(self) -> List[SqlIdentifier]:
+        """
+        Get default columns to cluster the feature view by.
+        Default cluster_by columns are join keys from entities and timestamp_col if it exists
+
+        Returns:
+            List of SqlIdentifiers representing the default columns to cluster the feature view by.
+        """
+        # We don't focus on the order of entities here, as users can define a custom 'cluster_by'
+        # if a specific order is required.
+        default_cluster_by_cols = [key for entity in self.entities if entity.join_keys for key in entity.join_keys]
+
+        if self.timestamp_col:
+            default_cluster_by_cols.append(self.timestamp_col)
+
+        return default_cluster_by_cols
+
 
 lineage_node.DOMAIN_LINEAGE_REGISTRY["feature_view"] = FeatureView
```
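The default derivation is simple enough to model with plain data: join keys from every entity, in entity order, plus `timestamp_col` when present (the entity shapes below are illustrative stand-ins for the real `Entity` objects):

```python
entities = [{"join_keys": ["CUSTOMER_ID"]}, {"join_keys": ["REGION_ID"]}]  # stand-ins
timestamp_col = "TS"

default_cluster_by = [key for entity in entities if entity["join_keys"] for key in entity["join_keys"]]
if timestamp_col:
    default_cluster_by.append(timestamp_col)
print(default_cluster_by)  # ['CUSTOMER_ID', 'REGION_ID', 'TS']
```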
snowflake/ml/jobs/_utils/constants.py

```diff
@@ -4,12 +4,15 @@ from snowflake.ml.jobs._utils.types import ComputeResources
 # SPCS specification constants
 DEFAULT_CONTAINER_NAME = "main"
 PAYLOAD_DIR_ENV_VAR = "MLRS_PAYLOAD_DIR"
+MEMORY_VOLUME_NAME = "dshm"
+STAGE_VOLUME_NAME = "stage-volume"
+STAGE_VOLUME_MOUNT_PATH = "/mnt/app"
 
 # Default container image information
 DEFAULT_IMAGE_REPO = "/snowflake/images/snowflake_images"
 DEFAULT_IMAGE_CPU = "st_plat/runtime/x86/runtime_image/snowbooks"
 DEFAULT_IMAGE_GPU = "st_plat/runtime/x86/generic_gpu/runtime_image/snowbooks"
-DEFAULT_IMAGE_TAG = "0.…
+DEFAULT_IMAGE_TAG = "0.9.2"
 DEFAULT_ENTRYPOINT_PATH = "func.py"
 
 # Percent of container memory to allocate for /dev/shm volume
```
```diff
@@ -19,6 +22,9 @@ MEMORY_VOLUME_SIZE = 0.3
 JOB_POLL_INITIAL_DELAY_SECONDS = 0.1
 JOB_POLL_MAX_DELAY_SECONDS = 1
 
+# Magic attributes
+IS_MLJOB_REMOTE_ATTR = "_is_mljob_remote_callable"
+
 # Compute pool resource information
 # TODO: Query Snowflake for resource information instead of relying on this hardcoded
 # table from https://docs.snowflake.com/en/sql-reference/sql/create-compute-pool
```
|