snowflake-ml-python 1.7.4__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry, and is provided for informational purposes only.
Files changed (73)
  1. snowflake/cortex/_complete.py +58 -3
  2. snowflake/ml/_internal/env_utils.py +64 -21
  3. snowflake/ml/_internal/file_utils.py +18 -4
  4. snowflake/ml/_internal/platform_capabilities.py +3 -0
  5. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  6. snowflake/ml/_internal/telemetry.py +25 -0
  7. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  8. snowflake/ml/feature_store/feature_store.py +18 -0
  9. snowflake/ml/feature_store/feature_view.py +46 -1
  10. snowflake/ml/fileset/fileset.py +0 -1
  11. snowflake/ml/jobs/_utils/constants.py +31 -1
  12. snowflake/ml/jobs/_utils/payload_utils.py +232 -72
  13. snowflake/ml/jobs/_utils/spec_utils.py +78 -38
  14. snowflake/ml/jobs/decorators.py +8 -25
  15. snowflake/ml/jobs/job.py +4 -4
  16. snowflake/ml/jobs/manager.py +5 -0
  17. snowflake/ml/model/_client/model/model_version_impl.py +1 -1
  18. snowflake/ml/model/_client/ops/model_ops.py +107 -14
  19. snowflake/ml/model/_client/ops/service_ops.py +1 -1
  20. snowflake/ml/model/_client/service/model_deployment_spec.py +7 -3
  21. snowflake/ml/model/_client/sql/model_version.py +58 -0
  22. snowflake/ml/model/_client/sql/service.py +8 -2
  23. snowflake/ml/model/_model_composer/model_composer.py +50 -3
  24. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  25. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
  26. snowflake/ml/model/_model_composer/model_method/model_method.py +0 -1
  27. snowflake/ml/model/_packager/model_env/model_env.py +49 -29
  28. snowflake/ml/model/_packager/model_handlers/_utils.py +8 -4
  29. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +44 -24
  30. snowflake/ml/model/_packager/model_handlers/keras.py +226 -0
  31. snowflake/ml/model/_packager/model_handlers/pytorch.py +51 -20
  32. snowflake/ml/model/_packager/model_handlers/sklearn.py +25 -3
  33. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +73 -21
  34. snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -72
  35. snowflake/ml/model/_packager/model_handlers/torchscript.py +49 -20
  36. snowflake/ml/model/_packager/model_handlers/xgboost.py +2 -2
  37. snowflake/ml/model/_packager/model_handlers_migrator/pytorch_migrator_2023_12_01.py +20 -0
  38. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  39. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2025_01_01.py +19 -0
  40. snowflake/ml/model/_packager/model_handlers_migrator/torchscript_migrator_2023_12_01.py +20 -0
  41. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +0 -1
  42. snowflake/ml/model/_packager/model_meta/model_meta.py +6 -2
  43. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +16 -0
  44. snowflake/ml/model/_packager/model_packager.py +3 -5
  45. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -2
  46. snowflake/ml/model/_packager/model_runtime/model_runtime.py +8 -1
  47. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  48. snowflake/ml/model/_signatures/builtins_handler.py +20 -9
  49. snowflake/ml/model/_signatures/core.py +54 -33
  50. snowflake/ml/model/_signatures/dmatrix_handler.py +98 -0
  51. snowflake/ml/model/_signatures/numpy_handler.py +12 -20
  52. snowflake/ml/model/_signatures/pandas_handler.py +28 -37
  53. snowflake/ml/model/_signatures/pytorch_handler.py +57 -41
  54. snowflake/ml/model/_signatures/snowpark_handler.py +0 -12
  55. snowflake/ml/model/_signatures/tensorflow_handler.py +61 -67
  56. snowflake/ml/model/_signatures/utils.py +120 -8
  57. snowflake/ml/model/custom_model.py +13 -4
  58. snowflake/ml/model/model_signature.py +39 -13
  59. snowflake/ml/model/type_hints.py +28 -2
  60. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  61. snowflake/ml/modeling/metrics/ranking.py +3 -0
  62. snowflake/ml/modeling/metrics/regression.py +3 -0
  63. snowflake/ml/modeling/pipeline/pipeline.py +18 -1
  64. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -1
  65. snowflake/ml/modeling/preprocessing/polynomial_features.py +2 -2
  66. snowflake/ml/registry/_manager/model_manager.py +55 -7
  67. snowflake/ml/registry/registry.py +52 -4
  68. snowflake/ml/version.py +1 -1
  69. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/METADATA +336 -27
  70. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/RECORD +73 -66
  71. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/WHEEL +1 -1
  72. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info/licenses}/LICENSE.txt +0 -0
  73. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.8.0.dist-info}/top_level.txt +0 -0
snowflake/cortex/_complete.py

@@ -23,6 +23,15 @@ logger = logging.getLogger(__name__)
 _REST_COMPLETE_URL = "/api/v2/cortex/inference:complete"
 
 
+class ResponseFormat(TypedDict):
+    """Represents an object describing response format config for structured-output mode"""
+
+    type: str
+    """The response format type (e.g. "json")"""
+    schema: Dict[str, Any]
+    """The schema defining the structure of the response. For json it should be a valid json schema object"""
+
+
 class ConversationMessage(TypedDict):
     """Represents a conversation interaction."""
 
@@ -53,6 +62,9 @@ class CompleteOptions(TypedDict):
     """ A boolean value that controls whether Cortex Guard filters unsafe or harmful responses
    from the language model. """
 
+    response_format: NotRequired[ResponseFormat]
+    """ An object describing response format config for structured-output mode """
+
 
 class ResponseParseException(Exception):
     """This exception is raised when the server response cannot be parsed."""
@@ -108,6 +120,32 @@ def _make_common_request_headers() -> Dict[str, str]:
     return headers
 
 
+def _validate_response_format_object(options: CompleteOptions) -> None:
+    """Validate the response format object for structured-output mode.
+
+    More details can be found in:
+    docs.snowflake.com/en/user-guide/snowflake-cortex/complete-structured-outputs#using-complete-structured-outputs
+
+    Args:
+        options: The complete options object.
+
+    Raises:
+        ValueError: If the response format object is invalid or missing required fields.
+    """
+    if options is not None and options.get("response_format") is not None:
+        options_obj = options.get("response_format")
+        if not isinstance(options_obj, dict):
+            raise ValueError("'response_format' should be an object")
+        if options_obj.get("type") is None:
+            raise ValueError("'type' cannot be empty for 'response_format' object")
+        if not isinstance(options_obj.get("type"), str):
+            raise ValueError("'type' needs to be a str for 'response_format' object")
+        if options_obj.get("schema") is None:
+            raise ValueError("'schema' cannot be empty for 'response_format' object")
+        if not isinstance(options_obj.get("schema"), dict):
+            raise ValueError("'schema' needs to be a dict for 'response_format' object")
+
+
 def _make_request_body(
     model: str,
     prompt: Union[str, List[ConversationMessage]],
@@ -136,12 +174,16 @@
             "response_when_unsafe": "Response filtered by Cortex Guard",
         }
         data["guardrails"] = guardrails_options
+    if "response_format" in options:
+        data["response_format"] = options["response_format"]
+
     return data
 
 
 # XP endpoint returns a dict response which needs to be converted to a format which can
 # be consumed by the SSEClient. This method does that.
 def _xp_dict_to_response(raw_resp: Dict[str, Any]) -> requests.Response:
+
     response = requests.Response()
     response.status_code = int(raw_resp["status"])
     response.headers = raw_resp["headers"]
@@ -159,7 +201,6 @@ def _xp_dict_to_response(raw_resp: Dict[str, Any]) -> requests.Response:
             data = json.loads(data)
         except json.JSONDecodeError:
             raise ValueError(f"Request failed (request id: {request_id})")
-
     if response.status_code < 200 or response.status_code >= 300:
         if "message" not in data:
             raise ValueError(f"Request failed (request id: {request_id})")
@@ -241,11 +282,21 @@ def _return_stream_response(response: requests.Response, deadline: Optional[floa
         if deadline is not None and time.time() > deadline:
             raise TimeoutError()
         try:
-            yield json.loads(event.data)["choices"][0]["delta"]["content"]
+            parsed_resp = json.loads(event.data)
+        except json.JSONDecodeError:
+            raise ResponseParseException("Server response cannot be parsed")
+        try:
+            yield parsed_resp["choices"][0]["delta"]["content"]
         except (json.JSONDecodeError, KeyError, IndexError):
             # For the sake of evolution of the output format,
             # ignore stream messages that don't match the expected format.
-            pass
+
+            # This is the case of midstream errors which were introduced specifically for structured output.
+            # TODO: discuss during code review
+            if parsed_resp.get("error"):
+                yield json.dumps(parsed_resp)
+            else:
+                pass
 
 
 def _complete_call_sql_function_snowpark(
@@ -291,6 +342,8 @@ def _complete_non_streaming_impl(
         raise ValueError("'model' cannot be a snowpark.Column when 'prompt' is a string.")
     if isinstance(options, snowpark.Column):
         raise ValueError("'options' cannot be a snowpark.Column when 'prompt' is a string.")
+    if options and not isinstance(options, snowpark.Column):
+        _validate_response_format_object(options)
     return _complete_non_streaming_immediate(
         snow_api_xp_request_handler=snow_api_xp_request_handler,
         model=model,
@@ -309,6 +362,8 @@ def _complete_rest(
     session: Optional[snowpark.Session] = None,
     deadline: Optional[float] = None,
 ) -> Iterator[str]:
+    if options:
+        _validate_response_format_object(options)
     if snow_api_xp_request_handler is not None:
         response = _call_complete_xp(
             snow_api_xp_request_handler=snow_api_xp_request_handler,
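Taken together, these hunks add structured-output support to Complete: a ResponseFormat TypedDict, a response_format field on CompleteOptions, client-side validation, and midstream-error passthrough for streaming. A usage sketch (the model name and JSON schema are illustrative, not from the diff):

```python
from snowflake.cortex import CompleteOptions, complete

options = CompleteOptions(
    response_format={
        "type": "json",
        "schema": {
            "type": "object",
            "properties": {"sentiment": {"type": "string"}},
            "required": ["sentiment"],
        },
    }
)
# _validate_response_format_object() raises ValueError before any request is
# sent if "type" is not a str or "schema" is not a dict.
result = complete(model="mistral-large2", prompt="Classify: 'Great product!'", options=options)
```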
snowflake/ml/_internal/env_utils.py

@@ -12,7 +12,7 @@ import yaml
 from packaging import requirements, specifiers, version
 
 import snowflake.connector
-from snowflake.ml._internal import env as snowml_env
+from snowflake.ml._internal import env as snowml_env, relax_version_strategy
 from snowflake.ml._internal.utils import query_result_checker
 from snowflake.snowpark import context, exceptions, session
 
@@ -56,6 +56,8 @@ def _validate_pip_requirement_string(req_str: str) -> requirements.Requirement:
 
         if r.name == "python":
            raise ValueError("Don't specify python as a dependency, use python version argument instead.")
+        if r.name == "cuda":
+            raise ValueError("Don't specify cuda as a dependency, use cuda version argument instead.")
     except requirements.InvalidRequirement:
         raise ValueError(f"Invalid package requirement {req_str} found.")
 
@@ -313,19 +315,14 @@ def get_package_spec_with_supported_ops_only(req: requirements.Requirement) -> r
     return new_req
 
 
-def relax_requirement_version(req: requirements.Requirement) -> requirements.Requirement:
-    """Relax version specifier from a requirement. It detects any ==x.y.z in specifiers and replaced with
-    >=x.y, <(x+1)
-
-    Args:
-        req: The requirement that version specifier to be removed.
-
-    Returns:
-        A new requirement object after relaxations.
-    """
-    new_req = copy.deepcopy(req)
+def _relax_specifier_set(
+    specifier_set: specifiers.SpecifierSet, strategy: relax_version_strategy.RelaxVersionStrategy
+) -> specifiers.SpecifierSet:
+    if strategy == relax_version_strategy.RelaxVersionStrategy.NO_RELAX:
+        return specifier_set
+    specifier_set = copy.deepcopy(specifier_set)
     relaxed_specifier_set = set()
-    for spec in new_req.specifier._specs:
+    for spec in specifier_set._specs:
         if spec.operator != "==":
             relaxed_specifier_set.add(spec)
             continue
@@ -337,9 +334,40 @@
             relaxed_specifier_set.add(spec)
             continue
         assert pinned_version is not None
-        relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}.{pinned_version.minor}"))
-        relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major + 1}"))
-    new_req.specifier._specs = frozenset(relaxed_specifier_set)
+        if strategy == relax_version_strategy.RelaxVersionStrategy.PATCH:
+            relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}.{pinned_version.minor}"))
+            relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major}.{pinned_version.minor+1}"))
+        elif strategy == relax_version_strategy.RelaxVersionStrategy.MINOR:
+            relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}.{pinned_version.minor}"))
+            relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major + 1}"))
+        elif strategy == relax_version_strategy.RelaxVersionStrategy.MAJOR:
+            relaxed_specifier_set.add(specifiers.Specifier(f">={pinned_version.major}"))
+            relaxed_specifier_set.add(specifiers.Specifier(f"<{pinned_version.major + 1}"))
+    specifier_set._specs = frozenset(relaxed_specifier_set)
+    return specifier_set
+
+
+def relax_requirement_version(req: requirements.Requirement) -> requirements.Requirement:
+    """Relax the version specifier of a requirement. It detects any ==x.y.z in specifiers and replaces it with a
+    relaxed specifier based on the strategy defined in RELAX_VERSION_STRATEGY_MAP.
+
+    NO_RELAX: No relaxation.
+    PATCH: >=x.y, <x.(y+1)
+    MINOR (default): >=x.y, <(x+1)
+    MAJOR: >=x, <(x+1)
+
+    Args:
+        req: The requirement whose version specifier is to be relaxed.
+
+    Returns:
+        A new requirement object after relaxation.
+    """
+    new_req = copy.deepcopy(req)
+    strategy = relax_version_strategy.RELAX_VERSION_STRATEGY_MAP.get(
+        req.name, relax_version_strategy.RelaxVersionStrategy.MINOR
+    )
+    new_req.specifier = _relax_specifier_set(new_req.specifier, strategy)
     return new_req
 
 
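To see the effect of the strategy-aware relaxation, a sketch against the internal helper (strategy assignments follow RELAX_VERSION_STRATEGY_MAP, shown later in this diff; printed specifier order may vary):

```python
from packaging import requirements
from snowflake.ml._internal import env_utils

# MINOR (default): ==1.26.4 relaxes to >=1.26, <2
print(env_utils.relax_requirement_version(requirements.Requirement("numpy==1.26.4")))
# PATCH (mapped for scikit-learn): ==1.5.2 relaxes to >=1.5, <1.6
print(env_utils.relax_requirement_version(requirements.Requirement("scikit-learn==1.5.2")))
# NO_RELAX (mapped for cloudpickle): the pin is kept as-is
print(env_utils.relax_requirement_version(requirements.Requirement("cloudpickle==2.2.1")))
```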
@@ -431,10 +459,11 @@ def save_conda_env_file(
     path: pathlib.Path,
     conda_chan_deps: DefaultDict[str, List[requirements.Requirement]],
     python_version: str,
+    cuda_version: Optional[str] = None,
     default_channel_override: str = SNOWFLAKE_CONDA_CHANNEL_URL,
 ) -> None:
     """Generate conda.yml file given a dict of dependencies after validation.
-    The channels part of conda.yml file will contains Snowflake Anaconda Channel, nodefaults and all channel names
+    The channels part of conda.yml file will contain Snowflake Anaconda Channel, nodefaults and all channel names
     in keys of the dict, ordered by the number of the packages which belong to them.
     The dependencies part of conda.yml file will contain requirements specifications. If the requirement is in the
     value list whose key is DEFAULT_CHANNEL_NAME, then the channel won't be specified explicitly. Otherwise, it will be
@@ -443,7 +472,8 @@ def save_conda_env_file(
     Args:
         path: Path to the conda.yml file.
         conda_chan_deps: Dict of conda dependencies after validation.
-        python_version: A string 'major.minor' showing python version relate to model.
+        python_version: A string 'major.minor' for the model's python version.
+        cuda_version: A string 'major.minor' for the model's cuda version.
         default_channel_override: The default channel to be put in the first place of the channels section.
     """
     assert path.suffix in [".yml", ".yaml"], "Conda environment file should have extension of yml or yaml."
@@ -461,6 +491,10 @@ def save_conda_env_file(
 
     env["channels"] = [default_channel_override] + channels + [_NODEFAULTS]
     env["dependencies"] = [f"python=={python_version}.*"]
+
+    if cuda_version is not None:
+        env["dependencies"].extend([f"nvidia::cuda=={cuda_version}.*"])
+
     for chan, reqs in conda_chan_deps.items():
         env["dependencies"].extend(
             [f"{chan}::{str(req)}" if chan != DEFAULT_CHANNEL_NAME else str(req) for req in reqs]
@@ -487,7 +521,12 @@ def save_requirements_file(path: pathlib.Path, pip_deps: List[requirements.Requi
 
 def load_conda_env_file(
     path: pathlib.Path,
-) -> Tuple[DefaultDict[str, List[requirements.Requirement]], Optional[List[requirements.Requirement]], Optional[str]]:
+) -> Tuple[
+    DefaultDict[str, List[requirements.Requirement]],
+    Optional[List[requirements.Requirement]],
+    Optional[str],
+    Optional[str],
+]:
     """Read conda.yml file to get a dict of dependencies after validation.
     The channels part of conda.yml file will be processed with following rules:
     1. If it is Snowflake Anaconda Channel, ignore as it is default.
@@ -515,7 +554,7 @@ def load_conda_env_file(
     and a string 'major.minor.patchlevel' of python version.
     """
     if not path.exists():
-        return collections.defaultdict(list), None, None
+        return collections.defaultdict(list), None, None, None
 
     with open(path, encoding="utf-8") as f:
         env = yaml.safe_load(stream=f)
@@ -526,6 +565,7 @@ def load_conda_env_file(
     pip_deps = []
 
     python_version = None
+    cuda_version = None
 
     channels = env.get("channels", [])
     if len(channels) >= 1:
@@ -541,6 +581,9 @@ def load_conda_env_file(
             # ver is str: python w/ specifier
             if ver:
                 python_version = ver
+            elif dep.startswith("nvidia::cuda"):
+                r = requirements.Requirement(dep.split("nvidia::")[1])
+                cuda_version = list(r.specifier)[0].version.strip(".*")
             elif ver is None:
                 deps.append(dep)
         elif isinstance(dep, dict) and "pip" in dep:
@@ -555,7 +598,7 @@ def load_conda_env_file(
         if channel not in conda_dep_dict:
             conda_dep_dict[channel] = []
 
-    return conda_dep_dict, pip_deps_list if pip_deps_list else None, python_version
+    return conda_dep_dict, pip_deps_list if pip_deps_list else None, python_version, cuda_version
 
 
 def load_requirements_file(path: pathlib.Path) -> List[requirements.Requirement]:
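A sketch of the new cuda_version round-trip through these helpers (internal API; the file name is arbitrary):

```python
import collections
import pathlib

from packaging import requirements
from snowflake.ml._internal import env_utils

deps = collections.defaultdict(list)
deps[env_utils.DEFAULT_CHANNEL_NAME].append(requirements.Requirement("numpy==1.26.4"))

# Writes "python==3.10.*" and "nvidia::cuda==11.8.*" into the dependencies section.
env_utils.save_conda_env_file(pathlib.Path("conda.yml"), deps, python_version="3.10", cuda_version="11.8")

# The loader now returns a 4-tuple; the new last element is the parsed cuda version.
conda_deps, pip_deps, py_ver, cuda_ver = env_utils.load_conda_env_file(pathlib.Path("conda.yml"))
assert cuda_ver == "11.8"
```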
snowflake/ml/_internal/file_utils.py

@@ -23,6 +23,7 @@ from typing import (
     Tuple,
     Union,
 )
+from urllib import parse
 
 import cloudpickle
 
@@ -294,7 +295,7 @@ def _retry_on_sql_error(exception: Exception) -> bool:
 def upload_directory_to_stage(
     session: snowpark.Session,
     local_path: pathlib.Path,
-    stage_path: pathlib.PurePosixPath,
+    stage_path: Union[pathlib.PurePosixPath, parse.ParseResult],
     *,
     statement_params: Optional[Dict[str, Any]] = None,
 ) -> None:
@@ -314,9 +315,22 @@ def upload_directory_to_stage(
         root_path = pathlib.Path(root)
         for filename in filenames:
             local_file_path = root_path / filename
-            stage_dir_path = (
-                stage_path / pathlib.PurePosixPath(local_file_path.relative_to(local_path).as_posix()).parent
-            )
+            relative_path = pathlib.PurePosixPath(local_file_path.relative_to(local_path).as_posix())
+
+            if isinstance(stage_path, parse.ParseResult):
+                relative_stage_path = (pathlib.PosixPath(stage_path.path) / relative_path).parent
+                new_url = parse.ParseResult(
+                    scheme=stage_path.scheme,
+                    netloc=stage_path.netloc,
+                    path=str(relative_stage_path),
+                    params=stage_path.params,
+                    query=stage_path.query,
+                    fragment=stage_path.fragment,
+                )
+                stage_dir_path = parse.urlunparse(new_url)
+            else:
+                stage_dir_path = str((stage_path / relative_path).parent)
+
             retrying.retry(
                 retry_on_exception=_retry_on_sql_error,
                 stop_max_attempt_number=5,
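To illustrate the new ParseResult branch, this standalone sketch rebuilds a destination URL the same way (the snow:// URL is a made-up example of a URL-style stage path):

```python
import pathlib
from urllib import parse

url = parse.urlparse("snow://model/MY_MODEL/versions/V1")  # hypothetical
relative = pathlib.PurePosixPath("runtimes/python_runtime/env.yml")

# Join the file's relative directory onto the URL's path, keeping scheme/netloc intact.
rebuilt = url._replace(path=str((pathlib.PosixPath(url.path) / relative).parent))
print(parse.urlunparse(rebuilt))
# snow://model/MY_MODEL/versions/V1/runtimes/python_runtime
```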
snowflake/ml/_internal/platform_capabilities.py

@@ -37,6 +37,9 @@ class PlatformCapabilities:
     def is_nested_function_enabled(self) -> bool:
         return self._get_bool_feature("SPCS_MODEL_ENABLE_EMBEDDED_SERVICE_FUNCTIONS", False)
 
+    def is_live_commit_enabled(self) -> bool:
+        return self._get_bool_feature("ENABLE_BUNDLE_MODULE_CHECKOUT", False)
+
     @staticmethod
     def _get_features(session: snowpark_session.Session) -> Dict[str, Any]:
         try:
snowflake/ml/_internal/relax_version_strategy.py (new file)

@@ -0,0 +1,16 @@
+from enum import Enum
+
+
+class RelaxVersionStrategy(Enum):
+    NO_RELAX = "no_relax"
+    PATCH = "patch"
+    MINOR = "minor"
+    MAJOR = "major"
+
+
+RELAX_VERSION_STRATEGY_MAP = {
+    # The version of cloudpickle should not be relaxed as it is used for serialization.
+    "cloudpickle": RelaxVersionStrategy.NO_RELAX,
+    # The version of scikit-learn should be relaxed only in patch version as it has breaking changes in minor version.
+    "scikit-learn": RelaxVersionStrategy.PATCH,
+}
snowflake/ml/_internal/telemetry.py

@@ -4,6 +4,9 @@ import enum
 import functools
 import inspect
 import operator
+import sys
+import time
+import traceback
 import types
 from typing import (
     Any,
@@ -75,6 +78,8 @@ class TelemetryField(enum.Enum):
     KEY_FUNC_PARAMS = "func_params"
     KEY_ERROR_INFO = "error_info"
     KEY_ERROR_CODE = "error_code"
+    KEY_STACK_TRACE = "stack_trace"
+    KEY_DURATION = "duration"
     KEY_VERSION = "version"
     KEY_PYTHON_VERSION = "python_version"
     KEY_OS = "operating_system"
@@ -348,6 +353,10 @@ def get_function_usage_statement_params(
         statement_params[TelemetryField.KEY_API_CALLS.value].append({TelemetryField.NAME.value: api_call})
     if custom_tags:
         statement_params[TelemetryField.KEY_CUSTOM_TAGS.value] = custom_tags
+    # Snowpark doesn't support None value in statement_params from version 1.29
+    for k in statement_params:
+        if statement_params[k] is None:
+            statement_params[k] = ""
     return statement_params
 
 
@@ -435,6 +444,7 @@ def send_api_usage_telemetry(
 
     # noqa: DAR402
     """
+    start_time = time.perf_counter()
 
    if subproject is not None and subproject_extractor is not None:
        raise ValueError("Specifying both subproject and subproject_extractor is not allowed")
@@ -555,8 +565,16 @@ def send_api_usage_telemetry(
                )
            else:
                me = e
+
            telemetry_args["error"] = repr(me)
            telemetry_args["error_code"] = me.error_code
+            # exclude telemetry frames
+            excluded_frames = 2
+            tb = traceback.extract_tb(sys.exc_info()[2])
+            formatted_tb = "".join(traceback.format_list(tb[excluded_frames:]))
+            formatted_exception = traceback.format_exception_only(*sys.exc_info()[:2])[0]  # error type + message
+            telemetry_args["stack_trace"] = formatted_tb + formatted_exception
+
            me.original_exception._snowflake_ml_handled = True  # type: ignore[attr-defined]
            if e is not me:
                raise  # Directly raise non-wrapped exceptions to preserve original stacktrace
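In isolation, the stack-trace capture above amounts to the following (standalone sketch; excluded_frames=2 drops the telemetry wrapper's own frames):

```python
import sys
import traceback

try:
    raise ValueError("boom")
except ValueError:
    tb = traceback.extract_tb(sys.exc_info()[2])
    formatted_tb = "".join(traceback.format_list(tb[2:]))  # skip the first two frames
    formatted_exception = traceback.format_exception_only(*sys.exc_info()[:2])[0]
    stack_trace = formatted_tb + formatted_exception  # value stored under KEY_STACK_TRACE
```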
@@ -565,6 +583,7 @@ def send_api_usage_telemetry(
            else:
                raise me.original_exception from e
        finally:
+            telemetry_args["duration"] = time.perf_counter() - start_time  # type: ignore[assignment]
            telemetry.send_function_usage_telemetry(**telemetry_args)
            global _log_counter
            _log_counter += 1
@@ -718,12 +737,14 @@ class _SourceTelemetryClient:
        self,
        func_name: str,
        function_category: str,
+        duration: float,
        func_params: Optional[Dict[str, Any]] = None,
        api_calls: Optional[List[Dict[str, Any]]] = None,
        sfqids: Optional[List[Any]] = None,
        custom_tags: Optional[Dict[str, Union[bool, int, str, float]]] = None,
        error: Optional[str] = None,
        error_code: Optional[str] = None,
+        stack_trace: Optional[str] = None,
    ) -> None:
        """
        Send function usage telemetry message.
@@ -731,12 +752,14 @@ class _SourceTelemetryClient:
        Args:
            func_name: Function name.
            function_category: Function category.
+            duration: Function duration.
            func_params: Function parameters.
            api_calls: API calls.
            sfqids: Snowflake query IDs.
            custom_tags: Custom tags.
            error: Error.
            error_code: Error code.
+            stack_trace: Error stack trace.
        """
        data: Dict[str, Any] = {
            TelemetryField.KEY_FUNC_NAME.value: func_name,
@@ -755,11 +778,13 @@ class _SourceTelemetryClient:
        message: Dict[str, Any] = {
            **self._create_basic_telemetry_data(telemetry_type),
            TelemetryField.KEY_DATA.value: data,
+            TelemetryField.KEY_DURATION.value: duration,
        }
 
        if error:
            message[TelemetryField.KEY_ERROR_INFO.value] = error
            message[TelemetryField.KEY_ERROR_CODE.value] = error_code
+            message[TelemetryField.KEY_STACK_TRACE.value] = stack_trace
 
        self._send(message)
 
snowflake/ml/data/_internal/arrow_ingestor.py

@@ -116,7 +116,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
     def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
         ds = self._get_dataset(shuffle=False)
         table = ds.to_table() if limit is None else ds.head(num_rows=limit)
-        return table.to_pandas()
+        return table.to_pandas(split_blocks=True, self_destruct=True)
 
     def _get_dataset(self, shuffle: bool) -> pds.Dataset:
         format = self._format
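The two new flags trade a defensive copy for lower peak memory when converting Arrow tables to pandas; the same call in isolation:

```python
import pyarrow as pa

table = pa.table({"x": [1, 2, 3]})
# split_blocks avoids consolidating columns into a single pandas block;
# self_destruct frees each Arrow buffer as it is converted, so `table`
# must not be touched again afterwards (per pyarrow's to_pandas docs).
df = table.to_pandas(split_blocks=True, self_destruct=True)
```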
snowflake/ml/feature_store/feature_store.py

@@ -144,6 +144,7 @@ _LIST_FEATURE_VIEW_SCHEMA = StructType(
         StructField("refresh_mode", StringType()),
         StructField("scheduling_state", StringType()),
         StructField("warehouse", StringType()),
+        StructField("cluster_by", StringType()),
     ]
 )
 
@@ -1832,6 +1833,12 @@ class FeatureStore:
                 WAREHOUSE = {warehouse}
                 REFRESH_MODE = {feature_view.refresh_mode}
                 INITIALIZE = {feature_view.initialize}
+            """
+            if feature_view.cluster_by:
+                cluster_by_clause = f"CLUSTER BY ({', '.join(feature_view.cluster_by)})"
+                query += f"{cluster_by_clause}"
+
+            query += f"""
                 AS {feature_view.query}
             """
             self._session.sql(query).collect(block=block, statement_params=self._telemetry_stmp)
@@ -2249,6 +2256,7 @@ class FeatureStore:
             values.append(row["refresh_mode"] if "refresh_mode" in row else None)
             values.append(row["scheduling_state"] if "scheduling_state" in row else None)
             values.append(row["warehouse"] if "warehouse" in row else None)
+            values.append(json.dumps(self._extract_cluster_by_columns(row["cluster_by"])) if "cluster_by" in row else None)
             output_values.append(values)
 
     def _lookup_feature_view_metadata(self, row: Row, fv_name: str) -> Tuple[_FeatureViewMetadata, str]:
@@ -2335,6 +2343,7 @@ class FeatureStore:
                 owner=row["owner"],
                 infer_schema_df=infer_schema_df,
                 session=self._session,
+                cluster_by=self._extract_cluster_by_columns(row["cluster_by"]),
             )
             return fv
         else:
@@ -2625,3 +2634,12 @@ class FeatureStore:
         )
 
         return feature_view
+
+    @staticmethod
+    def _extract_cluster_by_columns(cluster_by_clause: str) -> List[str]:
+        # Use regex to extract elements inside the parentheses.
+        match = re.search(r"\((.*?)\)", cluster_by_clause)
+        if match:
+            # Handle both quoted and unquoted column names.
+            return re.findall(identifier.SF_IDENTIFIER_RE, match.group(1))
+        return []
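The helper above pulls column names out of a SHOW-style cluster_by string such as LINEAR("CUSTOMER_ID", EVENT_TS). A self-contained approximation (using a simplified identifier pattern in place of the library's SF_IDENTIFIER_RE):

```python
import re
from typing import List

def extract_cluster_by_columns(cluster_by_clause: str) -> List[str]:
    # Take everything inside the first pair of parentheses, then pick out
    # quoted and unquoted identifiers.
    match = re.search(r"\((.*?)\)", cluster_by_clause)
    if not match:
        return []
    return re.findall(r'"[^"]+"|[A-Za-z_][A-Za-z0-9_$]*', match.group(1))

print(extract_cluster_by_columns('LINEAR("CUSTOMER_ID", EVENT_TS)'))
# ['"CUSTOMER_ID"', 'EVENT_TS']
```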
snowflake/ml/feature_store/feature_view.py

@@ -170,6 +170,7 @@ class FeatureView(lineage_node.LineageNode):
         warehouse: Optional[str] = None,
         initialize: str = "ON_CREATE",
         refresh_mode: str = "AUTO",
+        cluster_by: Optional[List[str]] = None,
         **_kwargs: Any,
     ) -> None:
         """
@@ -200,6 +201,9 @@ class FeatureView(lineage_node.LineageNode):
             refresh_mode: The refresh mode of managed feature view. The value can be 'AUTO', 'FULL' or 'INCREMENTAL'.
                 For managed feature view, the default value is 'AUTO'. For static feature view it has no effect.
                 Check https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table for details.
+            cluster_by: Columns to cluster the feature view by.
+                - Defaults to the join keys from entities.
+                - If `timestamp_col` is provided, it is added to the default clustering keys.
             _kwargs: reserved kwargs for system generated args. NOTE: DO NOT USE.
 
         Example::
@@ -224,6 +228,7 @@ class FeatureView(lineage_node.LineageNode):
             >>> print(registered_fv.status)
             FeatureViewStatus.ACTIVE
 
+        # noqa: DAR401
         """
 
         self._name: SqlIdentifier = SqlIdentifier(name)
@@ -233,7 +238,7 @@ class FeatureView(lineage_node.LineageNode):
             SqlIdentifier(timestamp_col) if timestamp_col is not None else None
         )
         self._desc: str = desc
-        self._infer_schema_df: DataFrame = _kwargs.get("_infer_schema_df", self._feature_df)
+        self._infer_schema_df: DataFrame = _kwargs.pop("_infer_schema_df", self._feature_df)
         self._query: str = self._get_query()
         self._version: Optional[FeatureViewVersion] = None
         self._status: FeatureViewStatus = FeatureViewStatus.DRAFT
@@ -249,6 +254,14 @@ class FeatureView(lineage_node.LineageNode):
         self._refresh_mode: Optional[str] = refresh_mode
         self._refresh_mode_reason: Optional[str] = None
         self._owner: Optional[str] = None
+        self._cluster_by: List[SqlIdentifier] = (
+            [SqlIdentifier(col) for col in cluster_by] if cluster_by is not None else self._get_default_cluster_by()
+        )
+
+        # Validate kwargs
+        if _kwargs:
+            raise TypeError(f"FeatureView.__init__ got an unexpected keyword argument: '{next(iter(_kwargs.keys()))}'")
+
         self._validate()
 
     def slice(self, names: List[str]) -> FeatureViewSlice:
@@ -394,6 +407,10 @@ class FeatureView(lineage_node.LineageNode):
     def timestamp_col(self) -> Optional[SqlIdentifier]:
         return self._timestamp_col
 
+    @property
+    def cluster_by(self) -> Optional[List[SqlIdentifier]]:
+        return self._cluster_by
+
     @property
     def desc(self) -> str:
         return self._desc
@@ -656,6 +673,14 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
             if not isinstance(col_type, (DateType, TimeType, TimestampType, _NumericType)):
                 raise ValueError(f"Invalid data type for timestamp_col {ts_col}: {col_type}.")
 
+        if self.cluster_by is not None:
+            for column in self.cluster_by:
+                if column not in df_cols:
+                    raise ValueError(
+                        f"Column '{column}' in `cluster_by` is not in the feature DataFrame schema. "
+                        f"{df_cols}, {self.cluster_by}"
+                    )
+
         if re.match(_RESULT_SCAN_QUERY_PATTERN, self._query) is not None:
             raise ValueError(f"feature_df should not be reading from RESULT_SCAN. Invalid query: {self._query}")
 
@@ -890,6 +915,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         owner: Optional[str],
         infer_schema_df: Optional[DataFrame],
         session: Session,
+        cluster_by: Optional[List[str]] = None,
     ) -> FeatureView:
         fv = FeatureView(
             name=name,
@@ -898,6 +924,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
             timestamp_col=timestamp_col,
             desc=desc,
             _infer_schema_df=infer_schema_df,
+            cluster_by=cluster_by,
         )
         fv._version = FeatureViewVersion(version) if version is not None else None
         fv._status = status
@@ -916,5 +943,23 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         )
         return fv
 
+    #
+    def _get_default_cluster_by(self) -> List[SqlIdentifier]:
+        """
+        Get default columns to cluster the feature view by.
+        Default cluster_by columns are join keys from entities and timestamp_col if it exists.
+
+        Returns:
+            List of SqlIdentifiers representing the default columns to cluster the feature view by.
+        """
+        # We don't focus on the order of entities here, as users can define a custom 'cluster_by'
+        # if a specific order is required.
+        default_cluster_by_cols = [key for entity in self.entities if entity.join_keys for key in entity.join_keys]
+
+        if self.timestamp_col:
+            default_cluster_by_cols.append(self.timestamp_col)
+
+        return default_cluster_by_cols
+
 
 lineage_node.DOMAIN_LINEAGE_REGISTRY["feature_view"] = FeatureView
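Putting the new parameter together with the public feature-store API (entity, column, and DataFrame names below are illustrative):

```python
from snowflake.ml.feature_store import Entity, FeatureView

customer = Entity(name="CUSTOMER", join_keys=["CUSTOMER_ID"])
fv = FeatureView(
    name="CUSTOMER_FEATURES",
    entities=[customer],
    feature_df=features_df,  # a Snowpark DataFrame defined elsewhere
    timestamp_col="EVENT_TS",
    # Optional: when omitted, defaults to the entities' join keys plus timestamp_col.
    cluster_by=["CUSTOMER_ID", "EVENT_TS"],
)
```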
snowflake/ml/fileset/fileset.py

@@ -257,7 +257,6 @@ class FileSet:
                     function_name=telemetry.get_statement_params_full_func_name(
                         inspect.currentframe(), cls.__class__.__name__
                     ),
-                    api_calls=[snowpark.DataFrameWriter.copy_into_location],
                 ),
             )
         except snowpark_exceptions.SnowparkSQLException as e: