PyPI - snowflake-ml-python - Versions diffs - 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl - Mend

snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py ADDED Viewed

@@ -0,0 +1,161 @@
+"""
+Description:
+    This is the helper file for distributed_hpo_trainer.py to create UDTF by `register_from_file`.
+Performance Benefits:
+    The performance benefits come from two aspects,
+    1. register_from_file can reduce duplicating loading data by only loading data once in each node
+    2. register_from_file enable user to load data in global variable, whereas writing UDF in python script cannot.
+Developer Tips:
+    Because this script is now a string, so there's no type hinting, linting, etc. It is highly recommended
+    to develop in a python script, test the type hinting, and then convert it into a string.
+"""
+execute_template = """
+from typing import Tuple, Any, List, Dict, Set, Iterator
+import os
+import sys
+import pandas as pd
+import numpy as np
+import numpy.typing as npt
+import cloudpickle as cp
+import io
+def _load_data_into_udf() -> Tuple[
+    npt.NDArray[Any],
+    npt.NDArray[Any],
+    List[List[int]],
+    List[Dict[str, Any]],
+    object,
+    Dict[str, Any],
+    Dict[str, Any],
+]:
+    import pyarrow.parquet as pq
+    data_files = [
+        filename
+        for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+        if filename.startswith("dataset")
+    ]
+    partial_df = [
+        pq.read_table(os.path.join(sys._xoptions["snowflake_import_directory"], file_name)).to_pandas()
+        for file_name in data_files
+    ]
+    df = pd.concat(partial_df, ignore_index=True)
+    constant_file_path = None
+    for filename in os.listdir(sys._xoptions["snowflake_import_directory"]):
+        if filename.startswith("constant"):
+            constant_file_path = os.path.join(sys._xoptions["snowflake_import_directory"], f"{filename}")
+    if constant_file_path is None:
+        raise ValueError("UDTF cannot find the constant location, abort!")
+    with open(constant_file_path, mode="rb") as constant_file_obj:
+        CONSTANTS = cp.load(constant_file_obj)
+    df.columns = CONSTANTS['dataset_snowpark_cols']
+    # load parameter grid
+    local_estimator_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"],
+        f"{CONSTANTS['estimator_location']}"
+    )
+    with open(local_estimator_file_path, mode="rb") as local_estimator_file_obj:
+        estimator_objects = cp.load(local_estimator_file_obj)
+        params_to_evaluate = estimator_objects["param_grid"]
+    # load indices
+    local_indices_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"],
+        f"{CONSTANTS['indices_location']}"
+    )
+    with open(local_indices_file_path, mode="rb") as local_indices_file_obj:
+        indices = cp.load(local_indices_file_obj)
+    # load base estimator
+    local_base_estimator_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['base_estimator_location']}"
+    )
+    with open(local_base_estimator_file_path, mode="rb") as local_base_estimator_file_obj:
+        base_estimator = cp.load(local_base_estimator_file_obj)
+    # load fit_and_score_kwargs
+    local_fit_and_score_kwargs_file_path = os.path.join(
+        sys._xoptions["snowflake_import_directory"], f"{CONSTANTS['fit_and_score_kwargs_location']}"
+    )
+    with open(local_fit_and_score_kwargs_file_path, mode="rb") as local_fit_and_score_kwargs_file_obj:
+        fit_and_score_kwargs = cp.load(local_fit_and_score_kwargs_file_obj)
+    # convert dataframe to numpy would save memory consumption
+    return (
+        df[CONSTANTS['input_cols']].to_numpy(),
+        df[CONSTANTS['label_cols']].squeeze().to_numpy(),
+        indices,
+        params_to_evaluate,
+        base_estimator,
+        fit_and_score_kwargs,
+        CONSTANTS
+    )
+global_load_data = _load_data_into_udf()
+# Note Table functions (UDTFs) have a limit of 500 input arguments and 500 output columns.
+class SearchCV:
+    def __init__(self) -> None:
+        X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs, CONSTANTS = global_load_data
+        self.X = X
+        self.y = y
+        self.test_indices = indices
+        self.params_to_evaluate = params_to_evaluate
+        self.base_estimator = base_estimator
+        self.fit_and_score_kwargs = fit_and_score_kwargs
+        self.fit_score_params: List[Any] = []
+        self.CONSTANTS = CONSTANTS
+        self.cv_indices_set: Set[int] = set()
+    def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
+        self.fit_score_params.extend([[idx, params_idx, cv_idx]])
+        self.cv_indices_set.add(cv_idx)
+    def end_partition(self) -> Iterator[Tuple[int, str]]:
+        from sklearn.base import clone
+        from sklearn.model_selection._validation import _fit_and_score
+        from sklearn.utils.parallel import Parallel, delayed
+        cached_train_test_indices = {}
+        # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
+        full_index = np.arange(self.CONSTANTS['DATA_LENGTH'])
+        for i in self.cv_indices_set:
+            cached_train_test_indices[i] = [
+                np.setdiff1d(full_index, self.test_indices[i]),
+                self.test_indices[i],
+            ]
+        parallel = Parallel(n_jobs=self.CONSTANTS['_N_JOBS'], pre_dispatch=self.CONSTANTS['_PRE_DISPATCH'])
+        out = parallel(
+            delayed(_fit_and_score)(
+                clone(self.base_estimator),
+                self.X,
+                self.y,
+                train=cached_train_test_indices[split_idx][0],
+                test=cached_train_test_indices[split_idx][1],
+                parameters=self.params_to_evaluate[cand_idx],
+                split_progress=(split_idx, self.CONSTANTS['n_splits']),
+                candidate_progress=(cand_idx, self.CONSTANTS['n_candidates']),
+                **self.fit_and_score_kwargs,  # load sample weight here
+            )
+            for _, cand_idx, split_idx in self.fit_score_params
+        )
+        binary_cv_results = None
+        with io.BytesIO() as f:
+            cp.dump(out, f)
+            f.seek(0)
+            binary_cv_results = f.getvalue().hex()
+        yield (
+            self.fit_score_params[0][0],
+            binary_cv_results,
+        )
+SearchCV._sf_node_singleton = True
+"""

snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py CHANGED Viewed

@@ -2,6 +2,7 @@ import importlib
 import inspect
 import os
 import posixpath
+import sys
 from typing import Any, Dict, List, Optional
 from uuid import uuid4
@@ -13,12 +14,10 @@ from snowflake.ml._internal.utils import (
     identifier,
     pkg_version_utils,
     snowpark_dataframe_utils,
+    temp_file_utils,
 )
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
-from snowflake.ml._internal.utils.temp_file_utils import (
-    cleanup_temp_files,
-    get_temp_file_path,
-)
+from snowflake.ml.modeling._internal import estimator_utils
 from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
 from snowflake.snowpark import DataFrame, Session, functions as F, types as T
 from snowflake.snowpark._internal.utils import (
@@ -26,7 +25,7 @@ from snowflake.snowpark._internal.utils import (
     random_name_for_temp_object,
 )
-cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
+cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
@@ -97,7 +96,25 @@ class SnowparkTransformHandlers:
         dependencies = self._get_validated_snowpark_dependencies(session, dependencies)
         dataset = self.dataset
-        estimator = self.estimator
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=self._subproject,
+            function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
+            api_calls=[F.pandas_udf],
+            custom_tags={"autogen": True} if self._autogenerated else None,
+        )
+        temp_stage_name = estimator_utils.create_temp_stage(session)
+        estimator_file_name = estimator_utils.upload_model_to_stage(
+            stage_name=temp_stage_name,
+            estimator=self.estimator,
+            session=session,
+            statement_params=statement_params,
+        )
+        imports = [f"@{temp_stage_name}/{estimator_file_name}"]
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
@@ -113,13 +130,13 @@ class SnowparkTransformHandlers:
         for field in fields:
             input_datatypes.append(field.datatype)
-        statement_params = telemetry.get_function_usage_statement_params(
-            project=_PROJECT,
-            subproject=self._subproject,
-            function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[F.pandas_udf],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
-        )
+        # TODO(xjiang): for optimization, use register_from_file to reduce duplicate loading estimator object
+        # or use cachetools here
+        def load_estimator() -> object:
+            estimator_file_path = os.path.join(sys._xoptions["snowflake_import_directory"], f"{estimator_file_name}")
+            with open(estimator_file_path, mode="rb") as local_estimator_file_obj:
+                estimator_object = cp.load(local_estimator_file_obj)
+            return estimator_object
         @F.pandas_udf(  # type: ignore[arg-type, misc]
             is_permanent=False,
@@ -129,6 +146,7 @@ class SnowparkTransformHandlers:
             session=session,
             statement_params=statement_params,
             input_types=[T.PandasDataFrameType(input_datatypes)],
+            imports=imports,  # type: ignore[arg-type]
         )
         def vec_batch_infer(input_df: pd.DataFrame) -> T.PandasSeries[dict]:  # type: ignore[type-arg]
             import numpy as np  # noqa: F401
@@ -136,6 +154,8 @@ class SnowparkTransformHandlers:
             input_df.columns = snowpark_cols
+            estimator = load_estimator()
             if hasattr(estimator, "n_jobs"):
                 # Vectorized UDF cannot handle joblib multiprocessing right now, deactivate the n_jobs
                 estimator.n_jobs = 1
@@ -225,7 +245,7 @@ class SnowparkTransformHandlers:
         queries = dataset.queries["queries"]
         # Create a temp file and dump the score to that file.
-        local_score_file_name = get_temp_file_path()
+        local_score_file_name = temp_file_utils.get_temp_file_path()
         with open(local_score_file_name, mode="w+b") as local_score_file:
             cp.dump(estimator, local_score_file)
@@ -247,7 +267,7 @@ class SnowparkTransformHandlers:
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[F.sproc],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         # Put locally serialized score on stage.
         session.file.put(
@@ -290,7 +310,7 @@ class SnowparkTransformHandlers:
             df: pd.DataFrame = sp_df.to_pandas(statement_params=score_statement_params)
             df.columns = sp_df.columns
-            local_score_file_name = get_temp_file_path()
+            local_score_file_name = temp_file_utils.get_temp_file_path()
             session.file.get(stage_score_file_name, local_score_file_name, statement_params=score_statement_params)
             local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0])
@@ -323,7 +343,7 @@ class SnowparkTransformHandlers:
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[Session.call],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
+            custom_tags={"autogen": True} if self._autogenerated else None,
         )
         kwargs = telemetry.get_sproc_statement_params_kwargs(score_wrapper_sproc, score_statement_params)
@@ -338,7 +358,7 @@ class SnowparkTransformHandlers:
             **kwargs,
         )
-        cleanup_temp_files([local_score_file_name])
+        temp_file_utils.cleanup_temp_files([local_score_file_name])
         return score

snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl

snowflake-ml-python 1.5.1__py3-none-any.whl → 1.5.3__py3-none-any.whl