PyPI - snowflake-ml-python - Versions diffs - 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl - Mend

snowflake-ml-python: 1.4.0 (py3-none-any.whl) → 1.5.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234):

snowflake/ml/_internal/env_utils.py +77 -32
snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
snowflake/ml/_internal/exceptions/error_codes.py +3 -0
snowflake/ml/_internal/lineage/data_source.py +10 -0
snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
snowflake/ml/_internal/utils/identifier.py +3 -1
snowflake/ml/_internal/utils/sql_identifier.py +2 -6
snowflake/ml/dataset/__init__.py +10 -0
snowflake/ml/dataset/dataset.py +454 -129
snowflake/ml/dataset/dataset_factory.py +53 -0
snowflake/ml/dataset/dataset_metadata.py +103 -0
snowflake/ml/dataset/dataset_reader.py +202 -0
snowflake/ml/feature_store/feature_store.py +531 -332
snowflake/ml/feature_store/feature_view.py +40 -23
snowflake/ml/fileset/embedded_stage_fs.py +146 -0
snowflake/ml/fileset/sfcfs.py +56 -54
snowflake/ml/fileset/snowfs.py +159 -0
snowflake/ml/fileset/stage_fs.py +49 -17
snowflake/ml/model/__init__.py +2 -2
snowflake/ml/model/_api.py +16 -1
snowflake/ml/model/_client/model/model_impl.py +27 -0
snowflake/ml/model/_client/model/model_version_impl.py +137 -50
snowflake/ml/model/_client/ops/model_ops.py +159 -40
snowflake/ml/model/_client/sql/model.py +25 -2
snowflake/ml/model/_client/sql/model_version.py +131 -2
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
snowflake/ml/model/_model_composer/model_composer.py +22 -1
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
snowflake/ml/model/_packager/model_env/model_env.py +41 -0
snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
snowflake/ml/model/_packager/model_packager.py +2 -5
snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
snowflake/ml/model/type_hints.py +21 -2
snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
snowflake/ml/modeling/_internal/model_trainer.py +7 -0
snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
snowflake/ml/modeling/cluster/birch.py +248 -175
snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
snowflake/ml/modeling/cluster/dbscan.py +246 -175
snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
snowflake/ml/modeling/cluster/k_means.py +248 -175
snowflake/ml/modeling/cluster/mean_shift.py +246 -175
snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
snowflake/ml/modeling/cluster/optics.py +246 -175
snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
snowflake/ml/modeling/compose/column_transformer.py +248 -175
snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
snowflake/ml/modeling/covariance/oas.py +246 -175
snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
snowflake/ml/modeling/decomposition/pca.py +248 -175
snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
snowflake/ml/modeling/framework/_utils.py +8 -1
snowflake/ml/modeling/framework/base.py +72 -37
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
snowflake/ml/modeling/impute/knn_imputer.py +248 -175
snowflake/ml/modeling/impute/missing_indicator.py +248 -175
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
snowflake/ml/modeling/linear_model/lars.py +246 -175
snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
snowflake/ml/modeling/linear_model/lasso.py +246 -175
snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
snowflake/ml/modeling/linear_model/perceptron.py +246 -175
snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
snowflake/ml/modeling/linear_model/ridge.py +246 -175
snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
snowflake/ml/modeling/manifold/isomap.py +248 -175
snowflake/ml/modeling/manifold/mds.py +248 -175
snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
snowflake/ml/modeling/manifold/tsne.py +248 -175
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
snowflake/ml/modeling/pipeline/pipeline.py +517 -35
snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
snowflake/ml/modeling/svm/linear_svc.py +246 -175
snowflake/ml/modeling/svm/linear_svr.py +246 -175
snowflake/ml/modeling/svm/nu_svc.py +246 -175
snowflake/ml/modeling/svm/nu_svr.py +246 -175
snowflake/ml/modeling/svm/svc.py +246 -175
snowflake/ml/modeling/svm/svr.py +246 -175
snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
snowflake/ml/registry/model_registry.py +3 -149
snowflake/ml/registry/registry.py +1 -1
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
snowflake/ml/registry/_artifact_manager.py +0 -156
snowflake/ml/registry/artifact.py +0 -46
snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
{snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
{snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
{snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0

snowflake/ml/dataset/dataset.py (changed)

@@ -1,161 +1,486 @@
 import json
-import time
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+import warnings
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, Union
-from snowflake.ml.registry.artifact import Artifact, ArtifactType
-from snowflake.snowpark import DataFrame, Session
+from snowflake import snowpark
+from snowflake.ml._internal import telemetry
+from snowflake.ml._internal.exceptions import (
+    dataset_error_messages,
+    dataset_errors,
+    error_codes,
+    exceptions as snowml_exceptions,
+)
+from snowflake.ml._internal.lineage import data_source
+from snowflake.ml._internal.utils import (
+    formatting,
+    identifier,
+    query_result_checker,
+    snowpark_dataframe_utils,
+)
+from snowflake.ml.dataset import dataset_metadata, dataset_reader
+from snowflake.snowpark import exceptions as snowpark_exceptions, functions
+_PROJECT = "Dataset"
+_TELEMETRY_STATEMENT_PARAMS = telemetry.get_function_usage_statement_params(_PROJECT)
+_METADATA_MAX_QUERY_LENGTH = 10000
+_DATASET_VERSION_NAME_COL = "version"
-def _get_val_or_null(val: Any) -> Any:
-    return val if val is not None else "null"
+class DatasetVersion:
+    """Represents a version of a Snowflake Dataset"""
-def _wrap_embedded_str(s: str) -> str:
-    s = s.replace("\\", "\\\\")
-    s = s.replace('"', '\\"')
-    return s
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def __init__(
+        self,
+        dataset: "Dataset",
+        version: str,
+    ) -> None:
+        """Initialize a DatasetVersion object.
+        Args:
+            dataset: The parent Snowflake Dataset.
+            version: Dataset version name.
+        """
+        self._parent = dataset
+        self._version = version
+        self._session: snowpark.Session = self._parent._session
-DATASET_SCHEMA_VERSION = "1"
+        self._properties: Optional[Dict[str, Any]] = None
+        self._raw_metadata: Optional[Dict[str, Any]] = None
+        self._metadata: Optional[dataset_metadata.DatasetMetadata] = None
+    @property
+    def name(self) -> str:
+        return self._version
-@dataclass(frozen=True)
-class FeatureStoreMetadata:
-    """
-    Feature store metadata.
+    @property
+    def created_on(self) -> datetime:
+        timestamp = self._get_property("created_on")
+        assert isinstance(timestamp, datetime)
+        return timestamp
-    Properties:
-        spine_query: The input query on source table which will be joined with features.
-        connection_params: a config contains feature store metadata.
-        features: A list of feature serialized object in the feature store.
+    @property
+    def comment(self) -> Optional[str]:
+        comment: Optional[str] = self._get_property("comment")
+        return comment
-    """
+    def _get_property(self, property_name: str, default: Any = None) -> Any:
+        if self._properties is None:
+            sql_result = (
+                query_result_checker.SqlResultValidator(
+                    self._session,
+                    f"SHOW VERSIONS LIKE '{self._version}' IN DATASET {self._parent.fully_qualified_name}",
+                    statement_params=_TELEMETRY_STATEMENT_PARAMS,
+                )
+                .has_dimensions(expected_rows=1)
+                .validate()
+            )
+            self._properties = sql_result[0].as_dict(True)
+        return self._properties.get(property_name, default)
+    def _get_metadata(self) -> Optional[dataset_metadata.DatasetMetadata]:
+        if self._raw_metadata is None:
+            self._raw_metadata = json.loads(self._get_property("metadata", "{}"))
+            try:
+                self._metadata = (
+                    dataset_metadata.DatasetMetadata.from_json(self._raw_metadata) if self._raw_metadata else None
+                )
+            except ValueError as e:
+                warnings.warn(f"Metadata parsing failed with error: {e}", UserWarning, stacklevel=2)
+        return self._metadata
-    spine_query: str
-    connection_params: Dict[str, str]
-    features: List[str]
+    def _get_exclude_cols(self) -> List[str]:
+        metadata = self._get_metadata()
+        if metadata is None:
+            return []
+        cols = []
+        if metadata.exclude_cols:
+            cols.extend(metadata.exclude_cols)
+        if metadata.label_cols:
+            cols.extend(metadata.label_cols)
+        return cols
-    def to_json(self) -> str:
-        state_dict = {
-            # TODO(zhe): Additional wrap is needed because ml_.artifact.ad_artifact takes a dict
-            # but we retrieve it as an object. Snowpark serialization is inconsistent with
-            # our deserialization. A fix is let artifact table stores string and callers
-            # handles both serialization and deserialization.
-            "spine_query": self.spine_query,
-            "connection_params": json.dumps(self.connection_params),
-            "features": json.dumps(self.features),
-        }
-        return json.dumps(state_dict)
+    def url(self) -> str:
+        """Returns the URL of the DatasetVersion contents in Snowflake.
+        Returns:
+            Snowflake URL string.
+        """
+        path = f"snow://dataset/{self._parent.fully_qualified_name}/versions/{self._version}/"
+        return path
-    @classmethod
-    def from_json(cls, json_str: str) -> "FeatureStoreMetadata":
-        json_dict = json.loads(json_str)
-        return cls(
-            spine_query=json_dict["spine_query"],
-            connection_params=json.loads(json_dict["connection_params"]),
-            features=json.loads(json_dict["features"]),
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def list_files(self, subdir: Optional[str] = None) -> List[snowpark.Row]:
+        """Get the list of remote file paths for the current DatasetVersion."""
+        return self._session.sql(f"LIST {self.url()}{subdir or ''}").collect(
+            statement_params=_TELEMETRY_STATEMENT_PARAMS
         )
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(dataset='{self._parent.fully_qualified_name}', version='{self.name}')"
-class Dataset(Artifact):
-    """Metadata of dataset."""
+class Dataset:
+    """Represents a Snowflake Dataset which is organized into versions."""
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
     def __init__(
         self,
-        session: Session,
-        df: DataFrame,
-        generation_timestamp: Optional[float] = None,
-        materialized_table: Optional[str] = None,
-        snapshot_table: Optional[str] = None,
-        timestamp_col: Optional[str] = None,
-        label_cols: Optional[List[str]] = None,
-        feature_store_metadata: Optional[FeatureStoreMetadata] = None,
-        desc: str = "",
+        session: snowpark.Session,
+        database: str,
+        schema: str,
+        name: str,
+        selected_version: Optional[str] = None,
     ) -> None:
-        """Initialize dataset object.
+        """Initialize a lazily evaluated Dataset object"""
+        self._session = session
+        self._db = database
+        self._schema = schema
+        self._name = name
+        self._fully_qualified_name = identifier.get_schema_level_object_identifier(database, schema, name)
+        self._version = DatasetVersion(self, selected_version) if selected_version else None
+        self._reader: Optional[dataset_reader.DatasetReader] = None
+    @property
+    def fully_qualified_name(self) -> str:
+        return self._fully_qualified_name
+    @property
+    def selected_version(self) -> Optional[DatasetVersion]:
+        return self._version
+    @property
+    def read(self) -> dataset_reader.DatasetReader:
+        if not self.selected_version:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ATTRIBUTE,
+                original_exception=RuntimeError("No Dataset version selected."),
+            )
+        if self._reader is None:
+            v = self.selected_version
+            self._reader = dataset_reader.DatasetReader(
+                self._session,
+                [
+                    data_source.DataSource(
+                        fully_qualified_name=self._fully_qualified_name,
+                        version=v.name,
+                        url=v.url(),
+                        exclude_cols=v._get_exclude_cols(),
+                    )
+                ],
+            )
+        return self._reader
+    @staticmethod
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def load(session: snowpark.Session, name: str) -> "Dataset":
+        """
+        Load an existing Snowflake Dataset. DatasetVersions can be created from the Dataset object
+        using `Dataset.create_version()` and loaded with `Dataset.version()`.
         Args:
-            session: An active snowpark session.
-            df: A dataframe object representing the dataset generation.
-            generation_timestamp: The timestamp when this dataset is generated. It will use current time if
-                not provided.
-            materialized_table: The destination table name which data will writes into.
-            snapshot_table: A snapshot table name on the materialized table.
-            timestamp_col: Timestamp column which was used for point-in-time correct feature lookup.
-            label_cols: Name of column(s) in materialized_table that contains labels.
-            feature_store_metadata: A feature store metadata object.
-            desc: A description about this dataset.
+            session: Snowpark Session to interact with Snowflake backend.
+            name: Name of dataset to load. May optionally be a schema-level identifier.
+        Returns:
+            Dataset object representing loaded dataset
+        Raises:
+            ValueError: name is not a valid Snowflake identifier
+            DatasetNotExistError: Specified Dataset does not exist
+        # noqa: DAR402
         """
-        self.df = df
-        self.generation_timestamp = generation_timestamp if generation_timestamp is not None else time.time()
-        self.materialized_table = materialized_table
-        self.snapshot_table = snapshot_table
-        self.timestamp_col = timestamp_col
-        self.label_cols = label_cols
-        self.feature_store_metadata = feature_store_metadata
-        self.desc = desc
-        self.owner = session.sql("SELECT CURRENT_USER()").collect()[0]["CURRENT_USER()"]
-        self.schema_version = DATASET_SCHEMA_VERSION
-        super().__init__(type=ArtifactType.DATASET, spec=self.to_json())
-    def load_features(self) -> Optional[List[str]]:
-        if self.feature_store_metadata is not None:
-            return self.feature_store_metadata.features
-        else:
-            return None
-    def features_df(self) -> DataFrame:
-        result = self.df
-        if self.timestamp_col is not None:
-            result = result.drop(self.timestamp_col)
-        if self.label_cols is not None:
-            result = result.drop(self.label_cols)
-        return result
-    def to_json(self) -> str:
-        if len(self.df.queries["queries"]) != 1:
-            raise ValueError(
-                f"""df dataframe must contain only 1 query.
-Got {len(self.df.queries['queries'])}: {self.df.queries['queries']}
-"""
+        db, schema, ds_name = _get_schema_level_identifier(session, name)
+        _validate_dataset_exists(session, db, schema, ds_name)
+        return Dataset(session, db, schema, ds_name)
+    @staticmethod
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def create(session: snowpark.Session, name: str, exist_ok: bool = False) -> "Dataset":
+        """
+        Create a new Snowflake Dataset. DatasetVersions can created from the Dataset object
+        using `Dataset.create_version()` and loaded with `Dataset.version()`.
+        Args:
+            session: Snowpark Session to interact with Snowflake backend.
+            name: Name of dataset to create. May optionally be a schema-level identifier.
+            exist_ok: If False, raises an exception if specified Dataset already exists
+        Returns:
+            Dataset object representing created dataset
+        Raises:
+            ValueError: name is not a valid Snowflake identifier
+            DatasetExistError: Specified Dataset already exists
+            DatasetError: Dataset creation failed
+        # noqa: DAR401
+        # noqa: DAR402
+        """
+        db, schema, ds_name = _get_schema_level_identifier(session, name)
+        ds_fqn = identifier.get_schema_level_object_identifier(db, schema, ds_name)
+        query = f"CREATE DATASET{' IF NOT EXISTS' if exist_ok else ''} {ds_fqn}"
+        try:
+            session.sql(query).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
+            return Dataset(session, db, schema, ds_name)
+        except snowpark_exceptions.SnowparkClientException as e:
+            # Snowpark wraps the Python Connector error code in the head of the error message.
+            if e.message.startswith(dataset_errors.ERRNO_OBJECT_ALREADY_EXISTS):
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.OBJECT_ALREADY_EXISTS,
+                    original_exception=dataset_errors.DatasetExistError(
+                        dataset_error_messages.DATASET_ALREADY_EXISTS.format(name)
+                    ),
+                ) from e
+            else:
+                raise
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def list_versions(self, detailed: bool = False) -> Union[List[str], List[snowpark.Row]]:
+        """Return list of versions"""
+        versions = self._list_versions()
+        versions.sort(key=lambda r: r[_DATASET_VERSION_NAME_COL])
+        if not detailed:
+            return [r[_DATASET_VERSION_NAME_COL] for r in versions]
+        return versions
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def select_version(self, version: str) -> "Dataset":
+        """Return a new Dataset instance with the specified version selected.
+        Args:
+            version: Dataset version name.
+        Returns:
+            Dataset object.
+        """
+        self._validate_version_exists(version)
+        return Dataset(self._session, self._db, self._schema, self._name, version)
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def create_version(
+        self,
+        version: str,
+        input_dataframe: snowpark.DataFrame,
+        shuffle: bool = False,
+        exclude_cols: Optional[List[str]] = None,
+        label_cols: Optional[List[str]] = None,
+        properties: Optional[dataset_metadata.DatasetPropertiesType] = None,
+        partition_by: Optional[str] = None,
+        comment: Optional[str] = None,
+    ) -> "Dataset":
+        """Create a new version of the current Dataset.
+        The result Dataset object captures the query result deterministically as stage files.
+        Args:
+            version: Dataset version name. Data contents are materialized to the Dataset entity.
+            input_dataframe: A Snowpark DataFrame which yields the Dataset contents.
+            shuffle: A boolean represents whether the data should be shuffled globally. Default to be false.
+            exclude_cols: Name of column(s) in dataset to be excluded during training/testing (e.g. timestamp).
+            label_cols: Name of column(s) in dataset that contains labels.
+            properties: Custom metadata properties, saved under `DatasetMetadata.properties`
+            partition_by: Optional partitioning scheme within the new Dataset version.
+            comment: A descriptive comment about this dataset.
+        Returns:
+            A Dataset object with the newly created version selected.
+        Raises:
+            SnowflakeMLException: The Dataset no longer exists.
+            SnowflakeMLException: The specified Dataset version already exists.
+            snowpark_exceptions.SnowparkClientException: An error occurred during Dataset creation.
+        Note: During the generation of stage files, data casting will occur. The casting rules are as follows::
+            - Data casting:
+                - DecimalType(NUMBER):
+                    - If its scale is zero, cast to BIGINT
+                    - If its scale is non-zero, cast to FLOAT
+                - DoubleType(DOUBLE): Cast to FLOAT.
+                - ByteType(TINYINT): Cast to SMALLINT.
+                - ShortType(SMALLINT):Cast to SMALLINT.
+                - IntegerType(INT): Cast to INT.
+                - LongType(BIGINT): Cast to BIGINT.
+            - No action:
+                - FloatType(FLOAT): No action.
+                - StringType(String): No action.
+                - BinaryType(BINARY): No action.
+                - BooleanType(BOOLEAN): No action.
+            - Not supported:
+                - ArrayType(ARRAY): Not supported. A warning will be logged.
+                - MapType(OBJECT): Not supported. A warning will be logged.
+                - TimestampType(TIMESTAMP): Not supported. A warning will be logged.
+                - TimeType(TIME): Not supported. A warning will be logged.
+                - DateType(DATE): Not supported. A warning will be logged.
+                - VariantType(VARIANT): Not supported. A warning will be logged.
+        """
+        casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe(input_dataframe)
+        if shuffle:
+            casted_df = casted_df.order_by(functions.random())
+        source_query = json.dumps(input_dataframe.queries)
+        if len(source_query) > _METADATA_MAX_QUERY_LENGTH:
+            warnings.warn(
+                "Source query exceeded max query length, dropping from metadata (limit=%d, actual=%d)"
+                % (_METADATA_MAX_QUERY_LENGTH, len(source_query)),
+                stacklevel=2,
             )
+            source_query = "<query too long>"
-        state_dict = {
-            "df_query": _wrap_embedded_str(self.df.queries["queries"][0]),
-            "generation_timestamp": self.generation_timestamp,
-            "owner": self.owner,
-            "materialized_table": _wrap_embedded_str(_get_val_or_null(self.materialized_table)),
-            "snapshot_table": _wrap_embedded_str(_get_val_or_null(self.snapshot_table)),
-            "timestamp_col": _wrap_embedded_str(_get_val_or_null(self.timestamp_col)),
-            "label_cols": _get_val_or_null(self.label_cols),
-            "feature_store_metadata": _wrap_embedded_str(self.feature_store_metadata.to_json())
-            if self.feature_store_metadata is not None
-            else "null",
-            "schema_version": self.schema_version,
-            "desc": self.desc,
-        }
-        return json.dumps(state_dict)
-    @classmethod
-    def from_json(cls, json_str: str, session: Session) -> "Dataset":
-        json_dict = json.loads(json_str, strict=False)
-        json_dict["df"] = session.sql(json_dict.pop("df_query"))
-        fs_meta_json = json_dict["feature_store_metadata"]
-        json_dict["feature_store_metadata"] = (
-            FeatureStoreMetadata.from_json(fs_meta_json) if fs_meta_json != "null" else None
+        metadata = dataset_metadata.DatasetMetadata(
+            source_query=source_query,
+            owner=self._session.sql("SELECT CURRENT_USER()").collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)[0][
+                "CURRENT_USER()"
+            ],
+            exclude_cols=exclude_cols,
+            label_cols=label_cols,
+            properties=properties,
         )
-        schema_version = json_dict.pop("schema_version")
-        owner = json_dict.pop("owner")
+        post_actions = casted_df._plan.post_actions
+        try:
+            # Execute all but the last query, final query gets passed to ALTER DATASET ADD VERSION
+            query = casted_df._plan.queries[-1].sql.strip()
+            if len(casted_df._plan.queries) > 1:
+                casted_df._plan.queries = casted_df._plan.queries[:-1]
+                casted_df._plan.post_actions = []
+                casted_df.collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
+            sql_command = "ALTER DATASET {} ADD VERSION '{}' FROM ({})".format(
+                self.fully_qualified_name,
+                version,
+                query,
+            )
+            if partition_by:
+                sql_command += f" PARTITION BY {partition_by}"
+            if comment:
+                sql_command += f" COMMENT={formatting.format_value_for_select(comment)}"
+            sql_command += f" METADATA=$${metadata.to_json()}$$"
+            self._session.sql(sql_command).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
+            return Dataset(self._session, self._db, self._schema, self._name, version)
-        result = cls(session, **json_dict)
-        result.schema_version = schema_version
-        result.owner = owner
+        except snowpark_exceptions.SnowparkClientException as e:
+            if e.message.startswith(dataset_errors.ERRNO_DATASET_NOT_EXIST):
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.NOT_FOUND,
+                    original_exception=dataset_errors.DatasetNotExistError(
+                        dataset_error_messages.DATASET_NOT_EXIST.format(self.fully_qualified_name)
+                    ),
+                ) from e
+            elif (
+                e.message.startswith(dataset_errors.ERRNO_DATASET_VERSION_ALREADY_EXISTS)
+                or e.message.startswith(dataset_errors.ERRNO_VERSION_ALREADY_EXISTS)
+                or e.message.startswith(dataset_errors.ERRNO_FILES_ALREADY_EXISTING)
+            ):
+                raise snowml_exceptions.SnowflakeMLException(
+                    error_code=error_codes.OBJECT_ALREADY_EXISTS,
+                    original_exception=dataset_errors.DatasetExistError(
+                        dataset_error_messages.DATASET_VERSION_ALREADY_EXISTS.format(self.fully_qualified_name, version)
+                    ),
+                ) from e
+            else:
+                raise
+        finally:
+            for action in post_actions:
+                self._session.sql(action.sql.strip()).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
-        return result
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def delete_version(self, version_name: str) -> None:
+        """Delete a single version from this Dataset.
-    def __eq__(self, other: object) -> bool:
-        return isinstance(other, Dataset) and self.to_json() == other.to_json()
+        Runs ``ALTER DATASET <fully_qualified_name> DROP VERSION '<version_name>'`` on the session.
+        Args:
+            version_name: Name of the version to delete from the Dataset.
+        Raises:
+            SnowflakeMLException: The DROP VERSION statement raised a Snowpark client error. The error code is
+                SNOWML_DELETE_FAILED and the Snowpark error text is carried in a DatasetCannotDeleteError.
+        """
+        # NOTE(review): version_name is interpolated verbatim between single quotes; a name containing a
+        # quote would alter the statement -- confirm callers validate or escape it.
+        delete_sql = f"ALTER DATASET {self.fully_qualified_name} DROP VERSION '{version_name}'"
+        try:
+            self._session.sql(delete_sql).collect(
+                statement_params=_TELEMETRY_STATEMENT_PARAMS,
+            )
+        except snowpark_exceptions.SnowparkClientException as e:
+            # Re-raise as the library's own exception type, keeping the Snowpark error as the cause.
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.SNOWML_DELETE_FAILED,
+                original_exception=dataset_errors.DatasetCannotDeleteError(str(e)),
+            ) from e
+        return
+    @telemetry.send_api_usage_telemetry(project=_PROJECT)
+    def delete(self) -> None:
+        """Drop this Dataset together with every version it contains.
+        Issues one ``DROP DATASET`` statement for the fully qualified name. Errors raised by the session
+        are not caught here.
+        """
+        # TODO: Check and warn if any versions exist
+        drop_sql = f"DROP DATASET {self.fully_qualified_name}"
+        self._session.sql(drop_sql).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
+    def _list_versions(self, pattern: Optional[str] = None) -> List[snowpark.Row]:
+        """Run ``SHOW VERSIONS ... IN DATASET`` for this Dataset and return the validated result.
+        Args:
+            pattern: Optional ``LIKE`` pattern; when empty or None no LIKE clause is added.
+        Returns:
+            The validator's result after requiring the version-name column (an empty result is allowed).
+        Raises:
+            SnowflakeMLException: The Snowpark error message starts with ERRNO_OBJECT_NOT_EXIST; reported as
+                NOT_FOUND with a DatasetNotExistError.
+            snowpark_exceptions.SnowparkClientException: Any other Snowpark client error, re-raised unchanged.
+        """
+        try:
+            like_clause = "" if not pattern else f" LIKE '{pattern}'"
+            show_sql = f"SHOW VERSIONS{like_clause} IN DATASET {self.fully_qualified_name}"
+            validator = query_result_checker.SqlResultValidator(
+                self._session,
+                show_sql,
+                statement_params=_TELEMETRY_STATEMENT_PARAMS,
+            )
+            return validator.has_column(_DATASET_VERSION_NAME_COL, allow_empty=True).validate()
+        except snowpark_exceptions.SnowparkClientException as e:
+            # Snowpark wraps the Python Connector error code in the head of the error message.
+            if not e.message.startswith(dataset_errors.ERRNO_OBJECT_NOT_EXIST):
+                raise
+            missing = dataset_errors.DatasetNotExistError(
+                dataset_error_messages.DATASET_NOT_EXIST.format(self.fully_qualified_name)
+            )
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.NOT_FOUND,
+                original_exception=missing,
+            ) from e
+    def _validate_version_exists(self, version: str) -> None:
+        """Check that ``version`` is listed for this Dataset.
+        Args:
+            version: Version name to look for.
+        Raises:
+            SnowflakeMLException: No listed version name equals ``version``; reported as NOT_FOUND with a
+                DatasetNotExistError.
+        """
+        # The name doubles as the listing pattern; the equality filter below makes the match case sensitive.
+        candidates = self._list_versions(version)
+        exact = [row for row in candidates if row[_DATASET_VERSION_NAME_COL] == version]
+        if exact:
+            return
+        not_found = dataset_errors.DatasetNotExistError(
+            dataset_error_messages.DATASET_VERSION_NOT_EXIST.format(self.fully_qualified_name, version)
+        )
+        raise snowml_exceptions.SnowflakeMLException(
+            error_code=error_codes.NOT_FOUND,
+            original_exception=not_found,
+        )
+# Utility methods
+def _get_schema_level_identifier(session: snowpark.Session, dataset_name: str) -> Tuple[str, str, str]:
+    """Split a dataset name into ``(database, schema, object_name)`` strings.
+    Parts that the parser leaves empty are filled in from the session's current database and schema.
+    Args:
+        session: Snowpark session used to look up the current database and schema.
+        dataset_name: Dataset name handed to ``identifier.parse_schema_level_object_identifier``.
+    Returns:
+        A ``(db, schema, object_name)`` tuple, each element converted with ``str()``.
+    Raises:
+        ValueError: The parser reported extra, unexpected content (``others``) in the name.
+    """
+    parsed = identifier.parse_schema_level_object_identifier(dataset_name)
+    db, schema, object_name, others = parsed
+    if others:
+        raise ValueError(f"Invalid identifier: unexpected '{others}'")
+    # NOTE(review): str() below would turn a missing (None) session default into the text "None" --
+    # confirm whether get_current_database()/get_current_schema() can return None here.
+    if not db:
+        db = session.get_current_database()
+    if not schema:
+        schema = session.get_current_schema()
+    return str(db), str(schema), str(object_name)
+def _validate_dataset_exists(session: snowpark.Session, db: str, schema: str, dataset_name: str) -> None:
+    """Raise unless ``SHOW DATASETS`` lists a dataset matching ``dataset_name`` in ``db.schema``.
+    Args:
+        session: Snowpark session used to run the SHOW DATASETS query.
+        db: Database name placed in the ``in schema`` clause.
+        schema: Schema name placed in the ``in schema`` clause.
+        dataset_name: Dataset identifier; it is resolved, and unescaped when wrapped in double quotes.
+    Raises:
+        SnowflakeMLException: The query returned no rows; reported as NOT_FOUND with a DatasetNotExistError.
+    """
+    # FIXME: Once we switch version to SQL Identifiers we can just use version check with version=''
+    dataset_name = identifier.resolve_identifier(dataset_name)
+    is_quoted = dataset_name.startswith('"') and dataset_name.endswith('"')
+    if is_quoted:
+        dataset_name = identifier.get_unescaped_names(dataset_name)
+    # Case sensitive match: the same name is given to both LIKE and STARTS WITH.
+    query = f"show datasets like '{dataset_name}' in schema {db}.{schema} starts with '{dataset_name}'"
+    if session.sql(query).count() != 0:
+        return
+    not_found = dataset_errors.DatasetNotExistError(dataset_error_messages.DATASET_NOT_EXIST.format(dataset_name))
+    raise snowml_exceptions.SnowflakeMLException(
+        error_code=error_codes.NOT_FOUND,
+        original_exception=not_found,
+    )

snowflake/ml/dataset/dataset_factory.py ADDED Viewed

@@ -0,0 +1,53 @@
+from typing import Any
+from snowflake import snowpark
+from snowflake.ml._internal import telemetry
+from snowflake.ml.dataset import dataset
+_PROJECT = "Dataset"
+@telemetry.send_api_usage_telemetry(project=_PROJECT)
+def create_from_dataframe(
+    session: snowpark.Session,
+    name: str,
+    version: str,
+    input_dataframe: snowpark.DataFrame,
+    **version_kwargs: Any,
+) -> dataset.Dataset:
+    """
+    Create a new versioned Dataset from a DataFrame and return the Dataset
+    with the newly created version selected.
+    ``Dataset.create`` is called with ``exist_ok=True`` before the version is added.
+    Args:
+        session: The Snowpark Session instance to use.
+        name: The dataset name
+        version: The dataset version name
+        input_dataframe: DataFrame containing data to be saved to the created Dataset.
+        version_kwargs: Keyword arguments passed to dataset version creation.
+            See `Dataset.create_version()` documentation for supported arguments.
+    Returns:
+        A Dataset object with the newly created version selected.
+    """
+    ds: dataset.Dataset = dataset.Dataset.create(session, name, exist_ok=True)
+    # create_version already returns a new Dataset with `version` selected, so it is returned directly
+    # instead of re-selecting the version afterwards.
+    return ds.create_version(version, input_dataframe=input_dataframe, **version_kwargs)
+@telemetry.send_api_usage_telemetry(project=_PROJECT)
+def load_dataset(session: snowpark.Session, name: str, version: str) -> dataset.Dataset:
+    """
+    Load an existing Dataset and select one of its versions.
+    Args:
+        session: The Snowpark Session instance to use.
+        name: The dataset name.
+        version: The dataset version name.
+    Returns:
+        The Dataset object returned by ``select_version(version)``.
+    """
+    loaded = dataset.Dataset.load(session, name)
+    selected: dataset.Dataset = loaded.select_version(version)
+    return selected

snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

snowflake-ml-python 1.4.0py3-none-any.whl → 1.5.0py3-none-any.whl