PyPI - snowflake-ml-python - Versions diffs - 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl - Mend

snowflake-ml-python 1.6.1-py3-none-any.whl → 1.6.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (212)

snowflake/ml/_internal/telemetry.py +142 -20
snowflake/ml/_internal/utils/identifier.py +48 -11
snowflake/ml/_internal/utils/snowflake_env.py +23 -13
snowflake/ml/_internal/utils/sql_identifier.py +1 -1
snowflake/ml/_internal/utils/table_manager.py +19 -1
snowflake/ml/_internal/utils/uri.py +2 -2
snowflake/ml/data/data_connector.py +33 -7
snowflake/ml/data/torch_utils.py +68 -0
snowflake/ml/dataset/dataset.py +1 -3
snowflake/ml/feature_store/feature_store.py +41 -17
snowflake/ml/feature_store/feature_view.py +2 -2
snowflake/ml/fileset/embedded_stage_fs.py +1 -1
snowflake/ml/fileset/fileset.py +1 -1
snowflake/ml/fileset/sfcfs.py +9 -3
snowflake/ml/model/_client/model/model_version_impl.py +22 -7
snowflake/ml/model/_client/ops/model_ops.py +39 -3
snowflake/ml/model/_client/ops/service_ops.py +198 -7
snowflake/ml/model/_client/service/model_deployment_spec.py +4 -5
snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -2
snowflake/ml/model/_client/sql/service.py +85 -18
snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
snowflake/ml/model/_model_composer/model_composer.py +2 -0
snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -8
snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
snowflake/ml/model/_packager/model_handlers/catboost.py +17 -15
snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +23 -15
snowflake/ml/model/_packager/model_handlers/lightgbm.py +15 -57
snowflake/ml/model/_packager/model_handlers/llm.py +4 -2
snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
snowflake/ml/model/_packager/model_handlers/sklearn.py +36 -24
snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +119 -6
snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
snowflake/ml/model/_packager/model_handlers/xgboost.py +48 -48
snowflake/ml/model/_packager/model_meta/model_meta.py +10 -7
snowflake/ml/model/_packager/model_meta/model_meta_schema.py +0 -8
snowflake/ml/model/_packager/model_packager.py +2 -0
snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
snowflake/ml/model/_signatures/utils.py +9 -0
snowflake/ml/model/models/llm.py +3 -1
snowflake/ml/model/type_hints.py +9 -1
snowflake/ml/modeling/_internal/constants.py +1 -0
snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
snowflake/ml/modeling/_internal/model_specifications.py +2 -0
snowflake/ml/modeling/_internal/model_trainer.py +1 -0
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
snowflake/ml/modeling/cluster/birch.py +60 -21
snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
snowflake/ml/modeling/cluster/dbscan.py +60 -21
snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
snowflake/ml/modeling/cluster/k_means.py +60 -21
snowflake/ml/modeling/cluster/mean_shift.py +60 -21
snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
snowflake/ml/modeling/cluster/optics.py +60 -21
snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
snowflake/ml/modeling/compose/column_transformer.py +60 -21
snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
snowflake/ml/modeling/covariance/oas.py +60 -21
snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
snowflake/ml/modeling/decomposition/pca.py +60 -21
snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
snowflake/ml/modeling/impute/knn_imputer.py +60 -21
snowflake/ml/modeling/impute/missing_indicator.py +60 -21
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
snowflake/ml/modeling/linear_model/lars.py +60 -21
snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
snowflake/ml/modeling/linear_model/lasso.py +60 -21
snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
snowflake/ml/modeling/linear_model/perceptron.py +60 -21
snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
snowflake/ml/modeling/linear_model/ridge.py +60 -21
snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
snowflake/ml/modeling/manifold/isomap.py +60 -21
snowflake/ml/modeling/manifold/mds.py +60 -21
snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
snowflake/ml/modeling/manifold/tsne.py +60 -21
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
snowflake/ml/modeling/pipeline/pipeline.py +1 -12
snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
snowflake/ml/modeling/svm/linear_svc.py +60 -21
snowflake/ml/modeling/svm/linear_svr.py +60 -21
snowflake/ml/modeling/svm/nu_svc.py +60 -21
snowflake/ml/modeling/svm/nu_svr.py +60 -21
snowflake/ml/modeling/svm/svc.py +60 -21
snowflake/ml/modeling/svm/svr.py +60 -21
snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
snowflake/ml/registry/_manager/model_manager.py +4 -0
snowflake/ml/registry/model_registry.py +1 -1
snowflake/ml/registry/registry.py +1 -2
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +23 -4
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +211 -209
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
snowflake/ml/data/torch_dataset.py +0 -33
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
{snowflake_ml_python-1.6.1.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0

snowflake/ml/_internal/telemetry.py CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import contextvars
 import enum
 import functools
 import inspect
@@ -12,6 +13,7 @@ from typing import (
     List,
     Mapping,
     Optional,
+    Set,
     Tuple,
     TypeVar,
     Union,
@@ -28,7 +30,7 @@ from snowflake.ml._internal.exceptions import (
     exceptions as snowml_exceptions,
 )
 from snowflake.snowpark import dataframe, exceptions as snowpark_exceptions, session
-from snowflake.snowpark._internal import utils
+from snowflake.snowpark._internal import server_connection, utils
 _log_counter = 0
 _FLUSH_SIZE = 10
@@ -85,6 +87,122 @@ class TelemetryField(enum.Enum):
     FUNC_CAT_USAGE = "usage"
+class _TelemetrySourceType(enum.Enum):
+    # Automatically inferred telemetry/statement parameters
+    AUTO_TELEMETRY = "SNOWML_AUTO_TELEMETRY"
+    # Mixture of manual and automatic telemetry/statement parameters
+    AUGMENT_TELEMETRY = "SNOWML_AUGMENT_TELEMETRY"
+_statement_params_context_var: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar("statement_params")
+class _StatementParamsPatchManager:
+    def __init__(self) -> None:
+        self._patch_cache: Set[server_connection.ServerConnection] = set()
+        self._context_var: contextvars.ContextVar[Dict[str, str]] = _statement_params_context_var
+    def apply_patches(self) -> None:
+        try:
+            # Apply patching to all active sessions in case of multiple
+            for sess in session._get_active_sessions():
+                # Check patch cache here to avoid unnecessary context switches
+                if self._get_target(sess) not in self._patch_cache:
+                    self._patch_session(sess)
+        except snowpark_exceptions.SnowparkSessionException:
+            pass
+    def set_statement_params(self, statement_params: Dict[str, str]) -> None:
+        # Only set value if not already set in context
+        if not self._context_var.get({}):
+            self._context_var.set(statement_params)
+    def _get_target(self, session: session.Session) -> server_connection.ServerConnection:
+        return cast(server_connection.ServerConnection, session._conn)
+    def _patch_session(self, session: session.Session, throw_on_patch_fail: bool = False) -> None:
+        # Extract target
+        try:
+            target = self._get_target(session)
+        except AttributeError:
+            if throw_on_patch_fail:
+                raise
+            # TODO: Log a warning, this probably means there was a breaking change in Snowpark/SnowflakeConnection
+            return
+        # Check if session has already been patched
+        if target in self._patch_cache:
+            return
+        self._patch_cache.add(target)
+        functions = [
+            ("execute_and_notify_query_listener", "_statement_params"),
+            ("execute_async_and_notify_query_listener", "_statement_params"),
+        ]
+        for func, param_name in functions:
+            try:
+                self._patch_with_statement_params(target, func, param_name=param_name)
+            except AttributeError:
+                if throw_on_patch_fail:  # primarily used for testing
+                    raise
+                # TODO: Log a warning, this probably means there was a breaking change in Snowpark/SnowflakeConnection
+                pass
+    def _patch_with_statement_params(
+        self, target: object, function_name: str, param_name: str = "statement_params"
+    ) -> None:
+        func = getattr(target, function_name)
+        assert callable(func)
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            # Retrieve context level statement parameters
+            context_params = self._context_var.get(dict())
+            if not context_params:
+                # Exit early if not in SnowML (decorator) context
+                return func(*args, **kwargs)
+            # Extract any explicitly provided statement parameters
+            orig_kwargs = dict(kwargs)
+            in_params = kwargs.pop(param_name, None) or {}
+            # Inject a special flag to statement parameters so we can filter out these patched logs if necessary
+            # Calls that include SnowML telemetry are tagged with "SNOWML_AUGMENT_TELEMETRY"
+            # and calls without SnowML telemetry are tagged with "SNOWML_AUTO_TELEMETRY"
+            if TelemetryField.KEY_PROJECT.value in in_params:
+                context_params["snowml_telemetry_type"] = _TelemetrySourceType.AUGMENT_TELEMETRY.value
+            else:
+                context_params["snowml_telemetry_type"] = _TelemetrySourceType.AUTO_TELEMETRY.value
+            # Apply any explicitly provided statement parameters and result into function call
+            context_params.update(in_params)
+            kwargs[param_name] = context_params
+            try:
+                return func(*args, **kwargs)
+            except TypeError as e:
+                if str(e).endswith(f"unexpected keyword argument '{param_name}'"):
+                    # TODO: Log warning that this patch is invalid
+                    # Unwrap function for future invocations
+                    setattr(target, function_name, func)
+                    return func(*args, **orig_kwargs)
+                else:
+                    raise
+        setattr(target, function_name, wrapper)
+    def __getstate__(self) -> Dict[str, Any]:
+        return {}
+    def __setstate__(self, state: Dict[str, Any]) -> None:
+        # unpickling does not call __init__ by default, do it manually here
+        self.__init__()  # type: ignore[misc]
+_patch_manager = _StatementParamsPatchManager()
 def get_statement_params(
     project: str, subproject: Optional[str] = None, class_name: Optional[str] = None
 ) -> Dict[str, Any]:
@@ -375,7 +493,18 @@ def send_api_usage_telemetry(
                         obj._statement_params = statement_params  # type: ignore[assignment]
                 return obj
+            # Set up framework-level credit usage instrumentation
+            ctx = contextvars.copy_context()
+            _patch_manager.apply_patches()
+            # This function should be executed with ctx.run()
+            def execute_func_with_statement_params() -> _ReturnValue:
+                _patch_manager.set_statement_params(statement_params)
+                result = func(*args, **kwargs)
+                return update_stmt_params_if_snowpark_df(result, statement_params)
             # prioritize `conn_attr_name` over the active session
+            telemetry_enabled = True
             if conn_attr_name:
                 # raise AttributeError if conn attribute does not exist in `self`
                 conn = operator.attrgetter(conn_attr_name)(args[0])
@@ -387,22 +516,17 @@ def send_api_usage_telemetry(
             else:
                 try:
                     active_session = next(iter(session._get_active_sessions()))
-                # server no default session
+                    conn = active_session._conn._conn
+                    telemetry_enabled = active_session.telemetry_enabled
                 except snowpark_exceptions.SnowparkSessionException:
-                    try:
-                        return update_stmt_params_if_snowpark_df(func(*args, **kwargs), statement_params)
-                    except Exception as e:
-                        if isinstance(e, snowml_exceptions.SnowflakeMLException):
-                            raise e.original_exception.with_traceback(e.__traceback__) from None
-                        # suppress SnowparkSessionException from telemetry in the stack trace
-                        raise e from None
-                conn = active_session._conn._conn
-                if (not active_session.telemetry_enabled) or (conn is None):
-                    try:
-                        return update_stmt_params_if_snowpark_df(func(*args, **kwargs), statement_params)
-                    except snowml_exceptions.SnowflakeMLException as e:
-                        raise e.original_exception from e
+                    conn = None
+            if conn is None or not telemetry_enabled:
+                # Telemetry not enabled, just execute without our additional telemetry logic
+                try:
+                    return ctx.run(execute_func_with_statement_params)
+                except snowml_exceptions.SnowflakeMLException as e:
+                    raise e.original_exception from e
             # TODO(hayu): [SNOW-750287] Optimize telemetry client to a singleton.
             telemetry = _SourceTelemetryClient(conn=conn, project=project, subproject=subproject_name)
@@ -415,11 +539,11 @@ def send_api_usage_telemetry(
                 custom_tags=custom_tags,
             )
             try:
-                res = func(*args, **kwargs)
+                return ctx.run(execute_func_with_statement_params)
             except Exception as e:
                 if not isinstance(e, snowml_exceptions.SnowflakeMLException):
                     # already handled via a nested decorated function
-                    if hasattr(e, "_snowflake_ml_handled") and e._snowflake_ml_handled:
+                    if getattr(e, "_snowflake_ml_handled", False):
                         raise e
                     if isinstance(e, snowpark_exceptions.SnowparkClientException):
                         me = snowml_exceptions.SnowflakeMLException(
@@ -438,8 +562,6 @@ def send_api_usage_telemetry(
                     raise me.original_exception from None
                 else:
                     raise me.original_exception from e
-            else:
-                return update_stmt_params_if_snowpark_df(res, statement_params)
             finally:
                 telemetry.send_function_usage_telemetry(**telemetry_args)
                 global _log_counter

snowflake/ml/_internal/utils/identifier.py CHANGED Viewed

@@ -10,9 +10,11 @@ SF_QUOTED_IDENTIFIER = '"(?:[^"]|"")*"'
 _SF_IDENTIFIER = f"({_SF_UNQUOTED_CASE_INSENSITIVE_IDENTIFIER}|{SF_QUOTED_IDENTIFIER})"
 SF_IDENTIFIER_RE = re.compile(_SF_IDENTIFIER)
 _SF_SCHEMA_LEVEL_OBJECT = (
-    rf"(?:(?:(?P<db>{_SF_IDENTIFIER})\.)?(?P<schema>{_SF_IDENTIFIER})\.)?(?P<object>{_SF_IDENTIFIER})(?P<others>.*)"
+    rf"(?:(?:(?P<db>{_SF_IDENTIFIER})\.)?(?P<schema>{_SF_IDENTIFIER})\.)?(?P<object>{_SF_IDENTIFIER})"
 )
+_SF_STAGE_PATH = rf"{_SF_SCHEMA_LEVEL_OBJECT}(?P<path>.*)"
 _SF_SCHEMA_LEVEL_OBJECT_RE = re.compile(_SF_SCHEMA_LEVEL_OBJECT)
+_SF_STAGE_PATH_RE = re.compile(_SF_STAGE_PATH)
 UNQUOTED_CASE_INSENSITIVE_RE = re.compile(f"^({_SF_UNQUOTED_CASE_INSENSITIVE_IDENTIFIER})$")
 UNQUOTED_CASE_SENSITIVE_RE = re.compile(f"^({_SF_UNQUOTED_CASE_SENSITIVE_IDENTIFIER})$")
@@ -139,29 +141,61 @@ def rename_to_valid_snowflake_identifier(name: str) -> str:
 def parse_schema_level_object_identifier(
+    object_name: str,
+) -> Tuple[Union[str, Any], Union[str, Any], Union[str, Any]]:
+    """Parse a string which starts with schema level object.
+    Args:
+        object_name: A string starts with a schema level object path, which is in the format
+            '<db>.<schema>.<object_name>'. Here, '<db>', '<schema>' and '<object_name>' are all snowflake identifiers.
+    Returns:
+        A tuple of 3 strings in the form of (db, schema, object_name).
+    Raises:
+        ValueError: If the id is invalid.
+    """
+    res = _SF_SCHEMA_LEVEL_OBJECT_RE.fullmatch(object_name)
+    if not res:
+        raise ValueError(
+            "Invalid identifier because it does not follow the pattern. "
+            f"It should start with [[database.]schema.]object. Getting {object_name}"
+        )
+    return (
+        res.group("db"),
+        res.group("schema"),
+        res.group("object"),
+    )
+def parse_snowflake_stage_path(
     path: str,
 ) -> Tuple[Union[str, Any], Union[str, Any], Union[str, Any], Union[str, Any]]:
-    """Parse a string which starts with schema level object.
+    """Parse a string which represents a snowflake stage path.
     Args:
-        path: A string starts with a schema level object path, which is in the format '<db>.<schema>.<object_name>'.
-            Here, '<db>', '<schema>' and '<object_name>' are all snowflake identifiers.
+        path: A string starts with a schema level object path, which is in the format
+            '<db>.<schema>.<object_name><path>'. Here, '<db>', '<schema>' and '<object_name>' are all snowflake
+            identifiers.
     Returns:
-        A tuple of 4 strings in the form of (db, schema, object_name, others). 'db', 'schema', 'object_name' are parsed
-            from the schema level object and 'others' are all the content post to the object.
+        A tuple of 4 strings in the form of (db, schema, object_name, path). 'db', 'schema', 'object_name' are parsed
+            from the schema level object and 'path' are all the content post to the object.
     Raises:
         ValueError: If the id is invalid.
     """
-    res = _SF_SCHEMA_LEVEL_OBJECT_RE.fullmatch(path)
+    res = _SF_STAGE_PATH_RE.fullmatch(path)
     if not res:
-        raise ValueError(f"Invalid identifier. It should start with database.schema.object. Getting {path}")
+        raise ValueError(
+            "Invalid identifier because it does not follow the pattern. "
+            f"It should start with [[database.]schema.]object. Getting {path}"
+        )
     return (
         res.group("db"),
         res.group("schema"),
         res.group("object"),
-        res.group("others"),
+        res.group("path"),
     )
@@ -175,8 +209,11 @@ def is_fully_qualified_name(name: str) -> bool:
     Returns:
         bool: True if the name is fully qualified, False otherwise.
     """
-    res = parse_schema_level_object_identifier(name)
-    return res[0] is not None and res[1] is not None and res[2] is not None and not res[3]
+    try:
+        res = parse_schema_level_object_identifier(name)
+        return all(res)
+    except ValueError:
+        return False
 def get_schema_level_object_identifier(

snowflake/ml/_internal/utils/snowflake_env.py CHANGED Viewed

@@ -2,7 +2,7 @@ import enum
 from typing import Any, Dict, Optional, TypedDict, cast
 from packaging import version
-from typing_extensions import Required
+from typing_extensions import NotRequired, Required
 from snowflake.ml._internal.utils import query_result_checker
 from snowflake.snowpark import session
@@ -52,7 +52,7 @@ class SnowflakeCloudType(enum.Enum):
 class SnowflakeRegion(TypedDict):
-    region_group: Required[str]
+    region_group: NotRequired[str]
     snowflake_region: Required[str]
     cloud: Required[SnowflakeCloudType]
     region: Required[str]
@@ -64,23 +64,33 @@ def get_regions(
 ) -> Dict[str, SnowflakeRegion]:
     res = (
         query_result_checker.SqlResultValidator(sess, "SHOW REGIONS", statement_params=statement_params)
-        .has_column("region_group")
         .has_column("snowflake_region")
         .has_column("cloud")
         .has_column("region")
         .has_column("display_name")
         .validate()
     )
-    return {
-        f"{r.region_group}.{r.snowflake_region}": SnowflakeRegion(
-            region_group=r.region_group,
-            snowflake_region=r.snowflake_region,
-            cloud=SnowflakeCloudType.from_value(r.cloud),
-            region=r.region,
-            display_name=r.display_name,
-        )
-        for r in res
-    }
+    res_dict = {}
+    for r in res:
+        if hasattr(r, "region_group") and r.region_group:
+            key = f"{r.region_group}.{r.snowflake_region}"
+            res_dict[key] = SnowflakeRegion(
+                region_group=r.region_group,
+                snowflake_region=r.snowflake_region,
+                cloud=SnowflakeCloudType.from_value(r.cloud),
+                region=r.region,
+                display_name=r.display_name,
+            )
+        else:
+            key = r.snowflake_region
+            res_dict[key] = SnowflakeRegion(
+                snowflake_region=r.snowflake_region,
+                cloud=SnowflakeCloudType.from_value(r.cloud),
+                region=r.region,
+                display_name=r.display_name,
+            )
+    return res_dict
 def get_current_region_id(sess: session.Session, *, statement_params: Optional[Dict[str, Any]] = None) -> str:

snowflake/ml/_internal/utils/sql_identifier.py CHANGED Viewed

@@ -84,7 +84,7 @@ def to_sql_identifiers(list_of_str: List[str], *, case_sensitive: bool = False)
 def parse_fully_qualified_name(
     name: str,
 ) -> Tuple[Optional[SqlIdentifier], Optional[SqlIdentifier], SqlIdentifier]:
-    db, schema, object, _ = identifier.parse_schema_level_object_identifier(name)
+    db, schema, object = identifier.parse_schema_level_object_identifier(name)
     assert name is not None, f"Unable parse the input name `{name}` as fully qualified."
     return (

snowflake/ml/_internal/utils/table_manager.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from typing import Any, Dict, List, Optional, Tuple
 from snowflake import snowpark
-from snowflake.ml._internal.utils import formatting, query_result_checker
+from snowflake.ml._internal.utils import formatting, identifier, query_result_checker
+from snowflake.snowpark import types
 """Table_manager is a set of utils that helps create tables.
@@ -104,3 +105,20 @@ def get_table_schema(session: snowpark.Session, table_name: str, qualified_schem
     for row in result:
         schema_dict[row["name"]] = row["type"]
     return schema_dict
+def get_table_schema_types(
+    session: snowpark.Session,
+    database: str,
+    schema: str,
+    table_name: str,
+) -> Dict[str, types.DataType]:
+    fully_qualified_table_name = identifier.get_schema_level_object_identifier(
+        db=database, schema=schema, object_name=table_name
+    )
+    struct_fields: List[types.StructField] = session.table(fully_qualified_table_name).schema.fields
+    schema_dict: Dict[str, types.DataType] = {}
+    for field in struct_fields:
+        schema_dict[field.name] = field.datatype
+    return schema_dict

snowflake/ml/_internal/utils/uri.py CHANGED Viewed

@@ -53,7 +53,7 @@ def get_uri_scheme(uri: str) -> str:
 def get_uri_from_snowflake_stage_path(stage_path: str) -> str:
     """Generates a URI from Snowflake stage path."""
     assert stage_path.startswith("@")
-    (db, schema, stage, path) = identifier.parse_schema_level_object_identifier(
+    (db, schema, stage, path) = identifier.parse_snowflake_stage_path(
         posixpath.normpath(identifier.remove_prefix(stage_path, "@"))
     )
     return urlunparse(
@@ -70,7 +70,7 @@ def get_uri_from_snowflake_stage_path(stage_path: str) -> str:
 def get_stage_and_path(stage_path: str) -> Tuple[str, str]:
     assert stage_path.startswith("@"), f"stage path should start with @, actual: {stage_path}"
-    (db, schema, stage, path) = identifier.parse_schema_level_object_identifier(
+    (db, schema, stage, path) = identifier.parse_snowflake_stage_path(
         posixpath.normpath(identifier.remove_prefix(stage_path, "@"))
     )
     full_qualified_stage = "@" + identifier.get_schema_level_object_identifier(db, schema, stage)

snowflake/ml/data/data_connector.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import os
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Type, TypeVar
 import numpy.typing as npt
@@ -7,6 +8,10 @@ from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml.data import data_ingestor, data_source
 from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor
+from snowflake.ml.modeling._internal.constants import (
+    IN_ML_RUNTIME_ENV_VAR,
+    USE_OPTIMIZED_DATA_INGESTOR,
+)
 if TYPE_CHECKING:
     import pandas as pd
@@ -142,32 +147,41 @@ class DataConnector:
         Returns:
             A Pytorch iterable datapipe that yield data.
         """
-        from torch.utils.data.datapipes import iter as torch_iter
+        from snowflake.ml.data import torch_utils
-        return torch_iter.IterableWrapper(  # type: ignore[no-untyped-call]
-            self._ingestor.to_batches(batch_size, shuffle, drop_last_batch)
+        return torch_utils.TorchDataPipeWrapper(
+            self._ingestor, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last_batch
         )
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
         subproject_extractor=lambda self: type(self).__name__,
-        func_params_to_log=["shuffle"],
+        func_params_to_log=["batch_size", "shuffle", "drop_last_batch"],
     )
-    def to_torch_dataset(self, *, shuffle: bool = False) -> "torch_data.IterableDataset":  # type: ignore[type-arg]
+    def to_torch_dataset(
+        self, *, batch_size: int = 1, shuffle: bool = False, drop_last_batch: bool = True
+    ) -> "torch_data.IterableDataset":  # type: ignore[type-arg]
         """Transform the Snowflake data into a PyTorch Iterable Dataset to be used with a DataLoader.
         Return a PyTorch Dataset which iterates on rows of data.
         Args:
+            batch_size: It specifies the size of each data batch which will be yielded in the result dataset.
+                Batching is pushed down to data ingestion level which may be more performant than DataLoader
+                batching.
             shuffle: It specifies whether the data will be shuffled. If True, files will be shuffled, and
                 rows in each file will also be shuffled.
+            drop_last_batch: Whether the last batch of data should be dropped. If set to be true,
+                then the last batch will get dropped if its size is smaller than the given batch_size.
         Returns:
             A PyTorch Iterable Dataset that yields data.
         """
-        from snowflake.ml.data import torch_dataset
+        from snowflake.ml.data import torch_utils
-        return torch_dataset.TorchDataset(self._ingestor, shuffle)
+        return torch_utils.TorchDatasetWrapper(
+            self._ingestor, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last_batch
+        )
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -184,3 +198,15 @@ class DataConnector:
             A Pandas DataFrame.
         """
         return self._ingestor.to_pandas(limit)
+# Switch to the Runtime's data ingester when running in the ML runtime. Both environment
+# variables must be set to a non-empty value for the switch to be attempted.
+# Fail silently if the data ingester is not found.
+if os.getenv(IN_ML_RUNTIME_ENV_VAR) and os.getenv(USE_OPTIMIZED_DATA_INGESTOR):
+    try:
+        from runtime_external_entities import get_ingester_class
+        DataConnector.DEFAULT_INGESTOR_CLASS = get_ingester_class()
+    except ImportError:
+        # Runtime default ingester not found: keep the existing DEFAULT_INGESTOR_CLASS.
+        pass

snowflake/ml/data/torch_utils.py ADDED Viewed

@@ -0,0 +1,68 @@
+from typing import Any, Dict, Iterator, List, Union
+import numpy as np
+import numpy.typing as npt
+import torch.utils.data
+from snowflake.ml.data import data_ingestor
+class TorchDatasetWrapper(torch.utils.data.IterableDataset[Dict[str, Any]]):
+    """Expose the batches of a DataIngestor through the PyTorch IterableDataset interface."""
+    def __init__(
+        self,
+        ingestor: data_ingestor.DataIngestor,
+        *,
+        batch_size: int,
+        shuffle: bool = False,
+        drop_last: bool = False,
+        squeeze_outputs: bool = True
+    ) -> None:
+        """Store the ingestor and batching options. Prefer DataConnector.to_torch_dataset() over direct use.
+        Args:
+            ingestor: Source of batches; its to_batches() is called each time the dataset is iterated.
+            batch_size: Forwarded to to_batches() as batch_size.
+            shuffle: Forwarded to to_batches() as shuffle.
+            drop_last: Forwarded to to_batches() as drop_last_batch.
+            squeeze_outputs: If True, every batch value is squeezed and object arrays are converted to lists
+                before being yielded; if False, batches are yielded unchanged.
+        """
+        self._ingestor = ingestor
+        self._batch_size = batch_size
+        self._shuffle = shuffle
+        self._drop_last = drop_last
+        self._squeeze_outputs = squeeze_outputs
+    def __iter__(self) -> Iterator[Dict[str, Union[npt.NDArray[Any], List[Any]]]]:
+        """Yield the batches that belong to the current DataLoader worker.
+        Outside of a DataLoader worker every batch is yielded. Inside a worker, batches are
+        assigned round-robin by position so that each batch is yielded by exactly one worker.
+        Yields:
+            One dict per batch, mapping each key of the ingestor's batch to its (optionally squeezed) values.
+        Raises:
+            RuntimeError: If shuffling is enabled while iterating inside a DataLoader worker.
+        """
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:
+            # No worker context: this iterator is responsible for every batch
+            num_workers, worker_id = 1, 0
+        else:
+            if self._shuffle:
+                raise RuntimeError("Dataset shuffling not currently supported with multithreading")
+            num_workers, worker_id = worker_info.num_workers, worker_info.id
+        batches = self._ingestor.to_batches(
+            batch_size=self._batch_size, shuffle=self._shuffle, drop_last_batch=self._drop_last
+        )
+        for position, batch in enumerate(batches):
+            # Skip batches owned by other workers to prevent data duplication in multi-process loading
+            if position % num_workers != worker_id:
+                continue
+            if not self._squeeze_outputs:
+                yield batch  # type: ignore[misc]
+                continue
+            # Basic preprocessing on batch values: squeeze away extra dimensions
+            # and convert object arrays (e.g. strings) to lists
+            yield {
+                name: (values.squeeze().tolist() if values.dtype == np.object_ else values.squeeze())
+                for name, values in batch.items()
+            }
+class TorchDataPipeWrapper(TorchDatasetWrapper, torch.utils.data.IterDataPipe[Dict[str, Any]]):
+    """Expose the batches of a DataIngestor through the PyTorch IterDataPipe interface."""
+    def __init__(
+        self,
+        ingestor: data_ingestor.DataIngestor,
+        *,
+        batch_size: int,
+        shuffle: bool = False,
+        drop_last: bool = False
+    ) -> None:
+        """Set up the wrapper with output squeezing turned off. Prefer DataConnector.to_torch_datapipe().
+        Args:
+            ingestor: Source of batches, handed to the parent wrapper.
+            batch_size: Handed to the parent wrapper as batch_size.
+            shuffle: Handed to the parent wrapper as shuffle.
+            drop_last: Handed to the parent wrapper as drop_last.
+        """
+        super().__init__(
+            ingestor,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            squeeze_outputs=False,
+        )

snowflake/ml/dataset/dataset.py CHANGED Viewed

@@ -472,9 +472,7 @@ lineage_node.DOMAIN_LINEAGE_REGISTRY["dataset"] = Dataset
 def _get_schema_level_identifier(session: snowpark.Session, dataset_name: str) -> Tuple[str, str, str]:
     """Resolve a dataset name into a validated schema-level location identifier"""
-    db, schema, object_name, others = identifier.parse_schema_level_object_identifier(dataset_name)
-    if others:
-        raise ValueError(f"Invalid identifier: unexpected '{others}'")
+    db, schema, object_name = identifier.parse_schema_level_object_identifier(dataset_name)
     db = db or session.get_current_database()
     schema = schema or session.get_current_schema()
     return str(db), str(schema), str(object_name)

snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl

snowflake-ml-python 1.6.1__py3-none-any.whl → 1.6.2__py3-none-any.whl