snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +77 -32
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/dataset/__init__.py +10 -0
- snowflake/ml/dataset/dataset.py +454 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +202 -0
- snowflake/ml/feature_store/feature_store.py +531 -332
- snowflake/ml/feature_store/feature_view.py +40 -23
- snowflake/ml/fileset/embedded_stage_fs.py +146 -0
- snowflake/ml/fileset/sfcfs.py +56 -54
- snowflake/ml/fileset/snowfs.py +159 -0
- snowflake/ml/fileset/stage_fs.py +49 -17
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +27 -0
- snowflake/ml/model/_client/model/model_version_impl.py +137 -50
- snowflake/ml/model/_client/ops/model_ops.py +159 -40
- snowflake/ml/model/_client/sql/model.py +25 -2
- snowflake/ml/model/_client/sql/model_version.py +131 -2
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -5
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
- snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
- snowflake/ml/modeling/cluster/birch.py +248 -175
- snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
- snowflake/ml/modeling/cluster/dbscan.py +246 -175
- snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
- snowflake/ml/modeling/cluster/k_means.py +248 -175
- snowflake/ml/modeling/cluster/mean_shift.py +246 -175
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
- snowflake/ml/modeling/cluster/optics.py +246 -175
- snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
- snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
- snowflake/ml/modeling/compose/column_transformer.py +248 -175
- snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
- snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
- snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
- snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
- snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
- snowflake/ml/modeling/covariance/oas.py +246 -175
- snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
- snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
- snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
- snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
- snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/pca.py +248 -175
- snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
- snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
- snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
- snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
- snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
- snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
- snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
- snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
- snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +72 -37
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
- snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
- snowflake/ml/modeling/impute/knn_imputer.py +248 -175
- snowflake/ml/modeling/impute/missing_indicator.py +248 -175
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/lars.py +246 -175
- snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
- snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/perceptron.py +246 -175
- snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/ridge.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
- snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
- snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
- snowflake/ml/modeling/manifold/isomap.py +248 -175
- snowflake/ml/modeling/manifold/mds.py +248 -175
- snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
- snowflake/ml/modeling/manifold/tsne.py +248 -175
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
- snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
- snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
- snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
- snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
- snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
- snowflake/ml/modeling/pipeline/pipeline.py +517 -35
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
- snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
- snowflake/ml/modeling/svm/linear_svc.py +246 -175
- snowflake/ml/modeling/svm/linear_svr.py +246 -175
- snowflake/ml/modeling/svm/nu_svc.py +246 -175
- snowflake/ml/modeling/svm/nu_svr.py +246 -175
- snowflake/ml/modeling/svm/svc.py +246 -175
- snowflake/ml/modeling/svm/svr.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
- snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
- snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
snowflake/ml/model/_packager/model_runtime/model_runtime.py
ADDED

@@ -0,0 +1,137 @@
+import copy
+import pathlib
+import warnings
+from typing import List, Literal, Optional
+
+from packaging import requirements
+
+from snowflake.ml._internal import env as snowml_env, env_utils, file_utils
+from snowflake.ml.model._packager.model_env import model_env
+from snowflake.ml.model._packager.model_meta import model_meta_schema
+from snowflake.ml.model._packager.model_runtime import (
+    _snowml_inference_alternative_requirements,
+)
+
+_SNOWML_INFERENCE_ALTERNATIVE_DEPENDENCIES = [
+    str(env_utils.get_package_spec_with_supported_ops_only(requirements.Requirement(r)))
+    for r in _snowml_inference_alternative_requirements.REQUIREMENTS
+]
+
+
+class ModelRuntime:
+    """Class to represent runtime in a model, which controls the runtime and version, imports and dependencies.
+
+    Attributes:
+        runtime_env: ModelEnv object representing the actual environment when deploying. The environment is based on
+            the environment from the packaged model with additional dependencies required to deploy.
+        imports: List of files to be imported in the created functions. At least packed model should be imported.
+            If the required Snowpark ML library is not available in the server-side, we will automatically pack the
+            local version as well as "snowflake-ml-python.zip" and added into the imports.
+    """
+
+    RUNTIME_DIR_REL_PATH = "runtimes"
+
+    def __init__(
+        self,
+        name: str,
+        env: model_env.ModelEnv,
+        imports: Optional[List[pathlib.PurePosixPath]] = None,
+        is_gpu: bool = False,
+        server_availability_source: Literal["snowflake", "conda"] = "snowflake",
+        loading_from_file: bool = False,
+    ) -> None:
+        self.name = name
+        self.runtime_env = copy.deepcopy(env)
+        self.imports = imports or []
+
+        if loading_from_file:
+            return
+
+        snowml_pkg_spec = f"{env_utils.SNOWPARK_ML_PKG_NAME}=={self.runtime_env.snowpark_ml_version}"
+        if self.runtime_env._snowpark_ml_version.local:
+            self.embed_local_ml_library = True
+        else:
+            if server_availability_source == "snowflake":
+                snowml_server_availability = (
+                    len(
+                        env_utils.get_matched_package_versions_in_information_schema_with_active_session(
+                            reqs=[requirements.Requirement(snowml_pkg_spec)],
+                            python_version=snowml_env.PYTHON_VERSION,
+                        ).get(env_utils.SNOWPARK_ML_PKG_NAME, [])
+                    )
+                    >= 1
+                )
+            else:
+                snowml_server_availability = (
+                    len(
+                        env_utils.get_matched_package_versions_in_snowflake_conda_channel(
+                            req=requirements.Requirement(snowml_pkg_spec),
+                            python_version=snowml_env.PYTHON_VERSION,
+                        )
+                    )
+                    >= 1
+                )
+            self.embed_local_ml_library = not snowml_server_availability
+
+        additional_package = (
+            _SNOWML_INFERENCE_ALTERNATIVE_DEPENDENCIES if self.embed_local_ml_library else [snowml_pkg_spec]
+        )
+
+        self.runtime_env.include_if_absent(
+            [
+                model_env.ModelDependency(requirement=dep, pip_name=requirements.Requirement(dep).name)
+                for dep in additional_package
+            ],
+        )
+
+        if is_gpu:
+            self.runtime_env.generate_env_for_cuda()
+
+    @property
+    def runtime_rel_path(self) -> pathlib.PurePosixPath:
+        return pathlib.PurePosixPath(ModelRuntime.RUNTIME_DIR_REL_PATH) / self.name
+
+    def save(self, packager_path: pathlib.Path) -> model_meta_schema.ModelRuntimeDict:
+        runtime_base_path = packager_path / self.runtime_rel_path
+        runtime_base_path.mkdir(parents=True, exist_ok=True)
+
+        if getattr(self, "embed_local_ml_library", False):
+            snowpark_ml_lib_path = runtime_base_path / "snowflake-ml-python.zip"
+            file_utils.zip_python_package(str(snowpark_ml_lib_path), "snowflake.ml")
+            snowpark_ml_lib_rel_path = pathlib.PurePosixPath(snowpark_ml_lib_path.relative_to(packager_path).as_posix())
+            self.imports.append(snowpark_ml_lib_rel_path)
+
+        self.runtime_env.conda_env_rel_path = self.runtime_rel_path / self.runtime_env.conda_env_rel_path
+        self.runtime_env.pip_requirements_rel_path = self.runtime_rel_path / self.runtime_env.pip_requirements_rel_path
+
+        env_dict = self.runtime_env.save_as_dict(packager_path)
+
+        return model_meta_schema.ModelRuntimeDict(
+            imports=list(map(str, self.imports)),
+            dependencies=model_meta_schema.ModelRuntimeDependenciesDict(
+                conda=env_dict["conda"],
+                pip=env_dict["pip"],
+            ),
+        )
+
+    @staticmethod
+    def load(
+        packager_path: pathlib.Path,
+        name: str,
+        meta_env: model_env.ModelEnv,
+        loaded_dict: model_meta_schema.ModelRuntimeDict,
+    ) -> "ModelRuntime":
+        env = model_env.ModelEnv()
+        env.python_version = meta_env.python_version
+        env.cuda_version = meta_env.cuda_version
+        env.snowpark_ml_version = meta_env.snowpark_ml_version
+
+        conda_env_rel_path = pathlib.PurePosixPath(loaded_dict["dependencies"]["conda"])
+        pip_requirements_rel_path = pathlib.PurePosixPath(loaded_dict["dependencies"]["pip"])
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            env.load_from_conda_file(packager_path / conda_env_rel_path)
+            env.load_from_pip_file(packager_path / pip_requirements_rel_path)
+        return ModelRuntime(
+            name=name, env=env, imports=list(map(pathlib.PurePosixPath, loaded_dict["imports"])), loading_from_file=True
+        )
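The heart of the new `ModelRuntime` class is deciding whether to embed the local `snowflake-ml-python` build into the model package. Below is a minimal standalone sketch of that decision, assuming a `server_versions` list that stands in for the information-schema or conda-channel lookup the real code performs; `should_embed_local_ml_library` is a hypothetical name, not the library's API.

```python
# Hypothetical, simplified restatement of ModelRuntime's embedding decision.
from typing import List

from packaging import version


def should_embed_local_ml_library(local_version: str, server_versions: List[str]) -> bool:
    """Embed the local package if it is a local build or unavailable server-side."""
    parsed = version.Version(local_version)
    if parsed.local is not None:  # e.g. "1.5.0+g123abc" can never exist in a channel
        return True
    return parsed not in (version.Version(v) for v in server_versions)


assert should_embed_local_ml_library("1.5.0+g123abc", ["1.5.0"])
assert not should_embed_local_ml_library("1.5.0", ["1.4.0", "1.5.0"])
assert should_embed_local_ml_library("1.5.0", ["1.4.0"])
```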
snowflake/ml/model/type_hints.py
CHANGED
@@ -19,6 +19,8 @@ from snowflake.ml.model import deploy_platforms
 from snowflake.ml.model._signatures import core
 
 if TYPE_CHECKING:
+    import catboost
+    import lightgbm
     import mlflow
     import numpy as np
     import pandas as pd
@@ -33,7 +35,6 @@ if TYPE_CHECKING:
     import snowflake.ml.model.custom_model
     import snowflake.ml.model.models.huggingface_pipeline
     import snowflake.ml.model.models.llm
-    import snowflake.ml.model.models.sentence_transformers
    import snowflake.snowpark
     from snowflake.ml.modeling.framework import base  # noqa: F401
 
@@ -69,6 +70,9 @@ _DataType = TypeVar("_DataType", bound=SupportedDataType)
 CustomModelType = TypeVar("CustomModelType", bound="snowflake.ml.model.custom_model.CustomModel")
 
 SupportedRequireSignatureModelType = Union[
+    "catboost.CatBoost",
+    "lightgbm.LGBMModel",
+    "lightgbm.Booster",
     "snowflake.ml.model.custom_model.CustomModel",
     "sklearn.base.BaseEstimator",
     "sklearn.pipeline.Pipeline",
@@ -85,7 +89,6 @@ SupportedNoSignatureRequirementsModelType = Union[
     "transformers.Pipeline",
     "sentence_transformers.SentenceTransformer",
     "snowflake.ml.model.models.huggingface_pipeline.HuggingFacePipelineModel",
-    "snowflake.ml.model.models.sentence_transformers.SentenceTransformer",
     "snowflake.ml.model.models.llm.LLM",
 ]
 
@@ -98,11 +101,14 @@ Here is all acceptable types of Snowflake native model packaging and its handler
 
 | Type | Handler File | Handler |
 |---------------------------------|--------------|---------------------|
+| catboost.CatBoost | catboost.py | _CatBoostModelHandler |
 | snowflake.ml.model.custom_model.CustomModel | custom.py | _CustomModelHandler |
 | sklearn.base.BaseEstimator | sklearn.py | _SKLModelHandler |
 | sklearn.pipeline.Pipeline | sklearn.py | _SKLModelHandler |
 | xgboost.XGBModel | xgboost.py | _XGBModelHandler |
 | xgboost.Booster | xgboost.py | _XGBModelHandler |
+| lightgbm.LGBMModel | lightgbm.py | _LGBMModelHandler |
+| lightgbm.Booster | lightgbm.py | _LGBMModelHandler |
 | snowflake.ml.framework.base.BaseEstimator | snowmlmodel.py | _SnowMLModelHandler |
 | torch.nn.Module | pytroch.py | _PyTorchHandler |
 | torch.jit.ScriptModule | torchscript.py | _TorchScriptHandler |
@@ -114,8 +120,10 @@ Here is all acceptable types of Snowflake native model packaging and its handler
 """
 
 SupportedModelHandlerType = Literal[
+    "catboost",
     "custom",
     "huggingface_pipeline",
+    "lightgbm",
     "mlflow",
     "pytorch",
     "sentence_transformers",
@@ -225,6 +233,11 @@ class BaseModelSaveOption(TypedDict):
     method_options: NotRequired[Dict[str, ModelMethodSaveOptions]]
 
 
+class CatBoostModelSaveOptions(BaseModelSaveOption):
+    target_methods: NotRequired[Sequence[str]]
+    cuda_version: NotRequired[str]
+
+
 class CustomModelSaveOption(BaseModelSaveOption):
     cuda_version: NotRequired[str]
 
@@ -238,6 +251,10 @@ class XGBModelSaveOptions(BaseModelSaveOption):
     cuda_version: NotRequired[str]
 
 
+class LGBMModelSaveOptions(BaseModelSaveOption):
+    target_methods: NotRequired[Sequence[str]]
+
+
 class SNOWModelSaveOptions(BaseModelSaveOption):
     target_methods: NotRequired[Sequence[str]]
 
@@ -279,7 +296,9 @@ class LLMSaveOptions(BaseModelSaveOption):
 
 ModelSaveOption = Union[
     BaseModelSaveOption,
+    CatBoostModelSaveOptions,
     CustomModelSaveOption,
+    LGBMModelSaveOptions,
     SKLModelSaveOptions,
     XGBModelSaveOptions,
     SNOWModelSaveOptions,
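The new save-option classes follow the existing `TypedDict`/`NotRequired` pattern, where every option key is optional and a type checker validates whichever subset a caller passes. A small sketch of how that pattern behaves (the example consumer code is illustrative, not part of the package; `typing.NotRequired` can replace `typing_extensions` on Python 3.11+):

```python
from typing import Sequence

from typing_extensions import NotRequired, TypedDict


class BaseModelSaveOption(TypedDict):
    embed_local_ml_library: NotRequired[bool]


class LGBMModelSaveOptions(BaseModelSaveOption):
    target_methods: NotRequired[Sequence[str]]


# Both an empty dict and any partial dict type-check against the TypedDict:
opts: LGBMModelSaveOptions = {"target_methods": ["predict"]}
print(opts.get("embed_local_ml_library", False))  # -> False
```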
snowflake/ml/modeling/_internal/estimator_utils.py
CHANGED

@@ -195,21 +195,26 @@ def handle_inference_result(
         shape = transformed_numpy_array.shape
         if len(shape) > 1:
             if shape[1] != len(output_cols):
-                # HeterogeneousEnsemble's transform method produce results with varying shapes
-                # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
-                # It is hard to predict the response shape without using fragile introspection logic.
-                # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with
-                # each element being a list.
-                if len(output_cols) != 1:
-                    raise TypeError(
-                        "expected_output_cols must be same length as transformed array or should be of length 1."
-                        f"Currently expected_output_cols shape is {len(output_cols)}, "
-                        f"transformed array shape is {shape}. "
-                    )
+                # Within UDF, it is not feasible to change the output cols because we need to
+                # query the output cols after UDF by the expected output cols
                 if not within_udf:
+                    # The following lines are to generate the output cols to match the length of
+                    # transformed_numpy_array
                     actual_output_cols = []
                     for i in range(shape[1]):
                         actual_output_cols.append(f"{output_cols[0]}_{i}")
                     output_cols = actual_output_cols
+                else:
+                    # HeterogeneousEnsemble's transform method produce results with varying shapes
+                    # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
+                    # It is hard to predict the response shape without using fragile introspection logic.
+                    # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with
+                    # each element being a list.
+                    if len(output_cols) != 1:
+                        raise TypeError(
+                            "expected_output_cols must be same length as transformed array or should be of length 1."
+                            f"Currently expected_output_cols shape is {len(output_cols)}, "
+                            f"transformed array shape is {shape}. "
+                        )
 
     return transformed_numpy_array, output_cols
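A standalone illustration of the renaming branch added above: outside a UDF, when the estimator returns more columns than `expected_output_cols`, the single expected name is expanded into one suffixed name per column (the array values here are invented):

```python
import numpy as np

transformed_numpy_array = np.ones((4, 3))  # e.g. predict_proba over 3 classes
output_cols = ["PREDICT_PROBA"]

if transformed_numpy_array.shape[1] != len(output_cols):
    # mirror of the not-within_udf branch: expand the single expected name
    output_cols = [f"{output_cols[0]}_{i}" for i in range(transformed_numpy_array.shape[1])]

print(output_cols)  # ['PREDICT_PROBA_0', 'PREDICT_PROBA_1', 'PREDICT_PROBA_2']
```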
snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py
CHANGED

@@ -99,7 +99,10 @@ class PandasTransformHandlers:
                 original_exception=ValueError(
                     "The feature names should match with those that were passed during fit.\n"
                     f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features
+                    f"Features specified with `input_cols` in estimator "
+                    f"{self.estimator.__class__.__name__} in the input dataframe: {input_cols}\n"
+                    f"In your input dataset for current method '{inference_method}', the features are:"
+                    f" {features_in_dataset}."
                 ),
             )
         input_df = dataset[columns_to_select]
snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py
CHANGED

@@ -3,6 +3,8 @@ from typing import List, Optional, Tuple
 
 import pandas as pd
 
+from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+
 
 class PandasModelTrainer:
     """
@@ -72,11 +74,61 @@ class PandasModelTrainer:
             Tuple[pd.DataFrame, object]: [predicted dataset, estimator]
         """
         assert hasattr(self.estimator, "fit_predict")  # make type checker happy
-
-        result = self.estimator.fit_predict(**args)
+        result = self.estimator.fit_predict(X=self.dataset[self.input_cols])
         result_df = pd.DataFrame(data=result, columns=expected_output_cols_list)
         if drop_input_cols:
             result_df = result_df
         else:
-
+            # in case the output column name overlap with the input column names,
+            # remove the ones in input column names
+            remove_dataset_col_name_exist_in_output_col = list(
+                set(self.dataset.columns) - set(expected_output_cols_list)
+            )
+            result_df = pd.concat([self.dataset[remove_dataset_col_name_exist_in_output_col], result_df], axis=1)
+        return (result_df, self.estimator)
+
+    def train_fit_transform(
+        self,
+        expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
+    ) -> Tuple[pd.DataFrame, object]:
+        """Trains the model using specified features and target columns from the dataset.
+        This API is different from fit itself because it would also provide the transform
+        output.
+
+        Args:
+            expected_output_cols_list (List[str]): The output columns
+                name as a list. Defaults to None.
+            drop_input_cols (Optional[bool]): Boolean to determine whether to
+                drop the input columns from the output dataset.
+
+        Returns:
+            Tuple[pd.DataFrame, object]: [transformed dataset, estimator]
+        """
+        assert hasattr(self.estimator, "fit")  # make type checker happy
+        assert hasattr(self.estimator, "fit_transform")  # make type checker happy
+
+        argspec = inspect.getfullargspec(self.estimator.fit)
+        args = {"X": self.dataset[self.input_cols]}
+        if self.label_cols:
+            label_arg_name = "Y" if "Y" in argspec.args else "y"
+            args[label_arg_name] = self.dataset[self.label_cols].squeeze()
+
+        if self.sample_weight_col is not None and "sample_weight" in argspec.args:
+            args["sample_weight"] = self.dataset[self.sample_weight_col].squeeze()
+
+        inference_res = self.estimator.fit_transform(**args)
+
+        transformed_numpy_array, output_cols = handle_inference_result(
+            inference_res=inference_res, output_cols=expected_output_cols_list, inference_method="fit_transform"
+        )
+
+        result_df = pd.DataFrame(data=transformed_numpy_array, columns=output_cols)
+        if drop_input_cols:
+            result_df = result_df
+        else:
+            # in case the output column name overlap with the input column names,
+            # remove the ones in input column names
+            remove_dataset_col_name_exist_in_output_col = list(set(self.dataset.columns) - set(output_cols))
+            result_df = pd.concat([self.dataset[remove_dataset_col_name_exist_in_output_col], result_df], axis=1)
         return (result_df, self.estimator)
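Both the fit-predict path and the new `train_fit_transform` end with the same concat-with-dedup step. A minimal reproduction (toy data) showing that an output column shadows a same-named input column instead of being duplicated:

```python
import pandas as pd

dataset = pd.DataFrame({"A": [1, 2], "OUTPUT": [0, 0]})
result_df = pd.DataFrame({"OUTPUT": [9, 9]})

# drop input columns whose names collide with output columns before concatenating
keep = list(set(dataset.columns) - set(result_df.columns))
combined = pd.concat([dataset[keep], result_df], axis=1)
print(sorted(combined.columns))  # ['A', 'OUTPUT'] -- no duplicate OUTPUT column
```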
snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py
CHANGED

@@ -72,24 +72,40 @@ class MLRuntimeTransformHandlers:
 
         """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        mlrs_inference_methods = ["predict", "predict_proba", "predict_log_proba"]
+
+        if inference_method in mlrs_inference_methods:
+            result_df = self.client.inference(
+                estimator=self.estimator,
+                dataset=self.dataset,
+                inference_method=inference_method,
+                input_cols=input_cols,
+                output_cols=expected_output_cols,
+                drop_input_cols=drop_input_cols,
+            )
+
+        else:
+            handler = SnowparkTransformHandlers(
+                dataset=self.dataset,
+                estimator=self.estimator,
+                class_name=self._class_name,
+                subproject=self._subproject,
+                autogenerated=self._autogenerated,
+            )
+            result_df = handler.batch_inference(
+                inference_method,
+                input_cols,
+                expected_output_cols,
+                session,
+                dependencies,
+                drop_input_cols,
+                expected_output_cols_type,
+                *args,
+                **kwargs,
+            )
+
+        assert isinstance(result_df, DataFrame)  # mypy - The MLRS return types are annotated as `object`.
+        return result_df
 
     def score(
         self,
snowflake/ml/modeling/_internal/model_trainer.py
CHANGED

@@ -22,3 +22,10 @@ class ModelTrainer(Protocol):
         drop_input_cols: Optional[bool] = False,
     ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
         raise NotImplementedError
+
+    def train_fit_transform(
+        self,
+        expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
+    ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
+        raise NotImplementedError
snowflake/ml/modeling/_internal/model_trainer_builder.py
CHANGED

@@ -138,21 +138,13 @@ class ModelTrainerBuilder:
         cls,
         estimator: object,
         dataset: Union[DataFrame, pd.DataFrame],
-        input_cols: Optional[List[str]] = None,
+        input_cols: List[str],
         autogenerated: bool = False,
         subproject: str = "",
     ) -> ModelTrainer:
         """
         Builder method that creates an appropriate ModelTrainer instance based on the given params.
         """
-        if input_cols is None:
-            raise exceptions.SnowflakeMLException(
-                error_code=error_codes.NOT_FOUND,
-                original_exception=ValueError(
-                    "The input column names (input_cols) is None.\n"
-                    "Please put your input_cols when initializing the estimator\n"
-                ),
-            )
         if isinstance(dataset, pd.DataFrame):
             return PandasModelTrainer(
                 estimator=estimator,
@@ -179,3 +171,44 @@ class ModelTrainerBuilder:
                 f"Unexpected dataset type: {type(dataset)}."
                 "Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
             )
+
+    @classmethod
+    def build_fit_transform(
+        cls,
+        estimator: object,
+        dataset: Union[DataFrame, pd.DataFrame],
+        input_cols: List[str],
+        label_cols: Optional[List[str]] = None,
+        sample_weight_col: Optional[str] = None,
+        autogenerated: bool = False,
+        subproject: str = "",
+    ) -> ModelTrainer:
+        """
+        Builder method that creates an appropriate ModelTrainer instance based on the given params.
+        """
+        if isinstance(dataset, pd.DataFrame):
+            return PandasModelTrainer(
+                estimator=estimator,
+                dataset=dataset,
+                input_cols=input_cols,
+                label_cols=label_cols,
+                sample_weight_col=sample_weight_col,
+            )
+        elif isinstance(dataset, DataFrame):
+            trainer_klass = SnowparkModelTrainer
+            init_args = {
+                "estimator": estimator,
+                "dataset": dataset,
+                "session": dataset._session,
+                "input_cols": input_cols,
+                "label_cols": label_cols,
+                "sample_weight_col": sample_weight_col,
+                "autogenerated": autogenerated,
+                "subproject": subproject,
+            }
+            return trainer_klass(**init_args)  # type: ignore[arg-type]
+        else:
+            raise TypeError(
+                f"Unexpected dataset type: {type(dataset)}."
+                "Supported dataset types: snowpark.DataFrame, pandas.DataFrame."
+            )
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py
CHANGED

@@ -955,22 +955,21 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 X, y, indices, params_to_evaluate, base_estimator, fit_and_score_kwargs = _load_data_into_udf()
                 self.X = X
                 self.y = y
-                self.indices = indices
+                self.test_indices = indices
                 self.params_to_evaluate = params_to_evaluate
                 self.base_estimator = base_estimator
                 self.fit_and_score_kwargs = fit_and_score_kwargs
                 self.fit_score_params: List[Any] = []
+                self.cached_train_test_indices = []
+                # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
+                full_index = np.arange(DATA_LENGTH)
+                for i in range(n_splits):
+                    self.cached_train_test_indices.extend(
+                        [[np.setdiff1d(full_index, self.test_indices[i]), self.test_indices[i]]]
+                    )
 
             def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
-
-                parameters = self.params_to_evaluate[params_idx]
-                # 2. Calculate the cross validator indices
-                # cross validator's indices: we stored test indices only (to save space);
-                # use the full index to re-construct each train index back.
-                full_index = np.array([i for i in range(DATA_LENGTH)])
-                test_index = self.indices[cv_idx]
-                train_index = np.setdiff1d(full_index, test_index)
-                self.fit_score_params.extend([[idx, (params_idx, parameters), (cv_idx, (train_index, test_index))]])
+                self.fit_score_params.extend([[idx, params_idx, cv_idx]])
 
             def end_partition(self) -> Iterator[Tuple[int, str]]:
                 from sklearn.base import clone
@@ -984,14 +983,14 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                         clone(self.base_estimator),
                         self.X,
                         self.y,
-                        train=train_index,
-                        test=test_index,
-                        parameters=parameters,
+                        train=self.cached_train_test_indices[split_idx][0],
+                        test=self.cached_train_test_indices[split_idx][1],
+                        parameters=self.params_to_evaluate[cand_idx],
                         split_progress=(split_idx, n_splits),
                         candidate_progress=(cand_idx, n_candidates),
                         **self.fit_and_score_kwargs,  # load sample weight here
                     )
-                    for _, (cand_idx, parameters), (split_idx, (train_index, test_index)) in self.fit_score_params
+                    for _, cand_idx, split_idx in self.fit_score_params
                 )
 
                 binary_cv_results = None
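The memory optimization in this hunk stores only each fold's test indices and reconstructs every train index once, up front, instead of on every `process` call. A standalone demo of the `np.setdiff1d` reconstruction (fold data invented):

```python
import numpy as np

DATA_LENGTH = 10
test_indices = [np.array([0, 1, 2]), np.array([3, 4, 5])]  # per-fold test rows
n_splits = len(test_indices)

full_index = np.arange(DATA_LENGTH)
cached_train_test_indices = [
    [np.setdiff1d(full_index, test_indices[i]), test_indices[i]] for i in range(n_splits)
]

print(cached_train_test_indices[0][0])  # fold 0 train rows: [3 4 5 6 7 8 9]
```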
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py
CHANGED

@@ -9,7 +9,11 @@ import cloudpickle as cp
 import pandas as pd
 
 from snowflake.ml._internal import telemetry
-from snowflake.ml._internal.utils import
+from snowflake.ml._internal.utils import (
+    identifier,
+    pkg_version_utils,
+    snowpark_dataframe_utils,
+)
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
@@ -91,6 +95,7 @@ class SnowparkTransformHandlers:
             A new dataset of the same type as the input dataset.
         """
 
+        dependencies = self._get_validated_snowpark_dependencies(session, dependencies)
         dataset = self.dataset
         estimator = self.estimator
         # Register vectorized UDF for batch inference
@@ -136,7 +141,7 @@ class SnowparkTransformHandlers:
             estimator.n_jobs = 1
             inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs)
 
-            transformed_numpy_array, output_cols = handle_inference_result(
+            transformed_numpy_array, _ = handle_inference_result(
                inference_res=inference_res,
                 output_cols=expected_output_cols,
                 inference_method=inference_method,
@@ -144,13 +149,13 @@ class SnowparkTransformHandlers:
             )
 
             if len(transformed_numpy_array.shape) > 1:
-                if transformed_numpy_array.shape[1] != len(output_cols):
+                if transformed_numpy_array.shape[1] != len(expected_output_cols):
                     series = pd.Series(transformed_numpy_array.tolist())
-                    transformed_pandas_df = pd.DataFrame(series, columns=output_cols)
+                    transformed_pandas_df = pd.DataFrame(series, columns=expected_output_cols)
                 else:
-                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=output_cols)
+                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=expected_output_cols)
             else:
-                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=output_cols)
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=expected_output_cols)
 
             return transformed_pandas_df.to_dict("records")  # type: ignore[no-any-return]
 
@@ -210,7 +215,8 @@ class SnowparkTransformHandlers:
         Returns:
             An accuracy score for the model on the given test data.
         """
-
+        dependencies = self._get_validated_snowpark_dependencies(session, dependencies)
+        dependencies.append("snowflake-snowpark-python")
         dataset = self.dataset
         estimator = self.estimator
         dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
@@ -335,3 +341,19 @@ class SnowparkTransformHandlers:
             cleanup_temp_files([local_score_file_name])
 
         return score
+
+    def _get_validated_snowpark_dependencies(self, session: Session, dependencies: List[str]) -> List[str]:
+        """A helper function to validate dependencies and return the available packages that exists
+        in the snowflake anaconda channel
+
+        Args:
+            session: the active snowpark Session
+            dependencies: unvalidated dependencies
+
+        Returns:
+            A list of packages present in the snoflake conda channel.
+        """
+
+        return pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=dependencies, session=session, subproject=self._subproject
+        )