snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. snowflake/ml/_internal/env_utils.py +72 -31
  2. snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
  3. snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
  4. snowflake/ml/_internal/exceptions/error_codes.py +3 -0
  5. snowflake/ml/_internal/lineage/data_source.py +10 -0
  6. snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
  7. snowflake/ml/_internal/telemetry.py +1 -0
  8. snowflake/ml/_internal/utils/identifier.py +1 -1
  9. snowflake/ml/_internal/utils/sql_identifier.py +14 -1
  10. snowflake/ml/dataset/__init__.py +11 -0
  11. snowflake/ml/dataset/dataset.py +455 -129
  12. snowflake/ml/dataset/dataset_factory.py +53 -0
  13. snowflake/ml/dataset/dataset_metadata.py +103 -0
  14. snowflake/ml/dataset/dataset_reader.py +199 -0
  15. snowflake/ml/feature_store/__init__.py +6 -0
  16. snowflake/ml/feature_store/access_manager.py +279 -0
  17. snowflake/ml/feature_store/feature_store.py +544 -358
  18. snowflake/ml/feature_store/feature_view.py +55 -16
  19. snowflake/ml/fileset/embedded_stage_fs.py +149 -0
  20. snowflake/ml/fileset/sfcfs.py +0 -4
  21. snowflake/ml/fileset/snowfs.py +160 -0
  22. snowflake/ml/fileset/stage_fs.py +25 -10
  23. snowflake/ml/model/__init__.py +2 -2
  24. snowflake/ml/model/_api.py +16 -1
  25. snowflake/ml/model/_client/model/model_impl.py +65 -31
  26. snowflake/ml/model/_client/model/model_version_impl.py +159 -2
  27. snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
  28. snowflake/ml/model/_client/ops/model_ops.py +268 -83
  29. snowflake/ml/model/_client/sql/_base.py +34 -0
  30. snowflake/ml/model/_client/sql/model.py +42 -47
  31. snowflake/ml/model/_client/sql/model_version.py +164 -39
  32. snowflake/ml/model/_client/sql/stage.py +6 -32
  33. snowflake/ml/model/_client/sql/tag.py +32 -56
  34. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
  35. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
  36. snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
  37. snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
  38. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
  39. snowflake/ml/model/_model_composer/model_composer.py +22 -1
  40. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +22 -0
  41. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +11 -0
  42. snowflake/ml/model/_packager/model_env/model_env.py +41 -0
  43. snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
  44. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -5
  45. snowflake/ml/model/_packager/model_packager.py +0 -3
  46. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
  47. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
  48. snowflake/ml/modeling/_internal/model_trainer.py +7 -0
  49. snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
  50. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
  51. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +24 -2
  52. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +340 -17
  53. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -52
  54. snowflake/ml/modeling/cluster/affinity_propagation.py +51 -52
  55. snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -52
  56. snowflake/ml/modeling/cluster/birch.py +53 -52
  57. snowflake/ml/modeling/cluster/bisecting_k_means.py +53 -52
  58. snowflake/ml/modeling/cluster/dbscan.py +51 -52
  59. snowflake/ml/modeling/cluster/feature_agglomeration.py +53 -52
  60. snowflake/ml/modeling/cluster/k_means.py +53 -52
  61. snowflake/ml/modeling/cluster/mean_shift.py +51 -52
  62. snowflake/ml/modeling/cluster/mini_batch_k_means.py +53 -52
  63. snowflake/ml/modeling/cluster/optics.py +51 -52
  64. snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -52
  65. snowflake/ml/modeling/cluster/spectral_clustering.py +51 -52
  66. snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -52
  67. snowflake/ml/modeling/compose/column_transformer.py +53 -52
  68. snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -52
  69. snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -52
  70. snowflake/ml/modeling/covariance/empirical_covariance.py +51 -52
  71. snowflake/ml/modeling/covariance/graphical_lasso.py +51 -52
  72. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -52
  73. snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -52
  74. snowflake/ml/modeling/covariance/min_cov_det.py +51 -52
  75. snowflake/ml/modeling/covariance/oas.py +51 -52
  76. snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -52
  77. snowflake/ml/modeling/decomposition/dictionary_learning.py +53 -52
  78. snowflake/ml/modeling/decomposition/factor_analysis.py +53 -52
  79. snowflake/ml/modeling/decomposition/fast_ica.py +53 -52
  80. snowflake/ml/modeling/decomposition/incremental_pca.py +53 -52
  81. snowflake/ml/modeling/decomposition/kernel_pca.py +53 -52
  82. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +53 -52
  83. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +53 -52
  84. snowflake/ml/modeling/decomposition/pca.py +53 -52
  85. snowflake/ml/modeling/decomposition/sparse_pca.py +53 -52
  86. snowflake/ml/modeling/decomposition/truncated_svd.py +53 -52
  87. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +53 -52
  88. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -52
  89. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -52
  90. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -52
  91. snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -52
  92. snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -52
  93. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -52
  94. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -52
  95. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -52
  96. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -52
  97. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -52
  98. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -52
  99. snowflake/ml/modeling/ensemble/isolation_forest.py +51 -52
  100. snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -52
  101. snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -52
  102. snowflake/ml/modeling/ensemble/stacking_regressor.py +53 -52
  103. snowflake/ml/modeling/ensemble/voting_classifier.py +53 -52
  104. snowflake/ml/modeling/ensemble/voting_regressor.py +53 -52
  105. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +53 -52
  106. snowflake/ml/modeling/feature_selection/select_fdr.py +53 -52
  107. snowflake/ml/modeling/feature_selection/select_fpr.py +53 -52
  108. snowflake/ml/modeling/feature_selection/select_fwe.py +53 -52
  109. snowflake/ml/modeling/feature_selection/select_k_best.py +53 -52
  110. snowflake/ml/modeling/feature_selection/select_percentile.py +53 -52
  111. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +53 -52
  112. snowflake/ml/modeling/feature_selection/variance_threshold.py +53 -52
  113. snowflake/ml/modeling/framework/base.py +64 -36
  114. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -52
  115. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -52
  116. snowflake/ml/modeling/impute/iterative_imputer.py +53 -52
  117. snowflake/ml/modeling/impute/knn_imputer.py +53 -52
  118. snowflake/ml/modeling/impute/missing_indicator.py +53 -52
  119. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +53 -52
  120. snowflake/ml/modeling/kernel_approximation/nystroem.py +53 -52
  121. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +53 -52
  122. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +53 -52
  123. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +53 -52
  124. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -52
  125. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -52
  126. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -52
  127. snowflake/ml/modeling/linear_model/ard_regression.py +51 -52
  128. snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -52
  129. snowflake/ml/modeling/linear_model/elastic_net.py +51 -52
  130. snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -52
  131. snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -52
  132. snowflake/ml/modeling/linear_model/huber_regressor.py +51 -52
  133. snowflake/ml/modeling/linear_model/lars.py +51 -52
  134. snowflake/ml/modeling/linear_model/lars_cv.py +51 -52
  135. snowflake/ml/modeling/linear_model/lasso.py +51 -52
  136. snowflake/ml/modeling/linear_model/lasso_cv.py +51 -52
  137. snowflake/ml/modeling/linear_model/lasso_lars.py +51 -52
  138. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -52
  139. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -52
  140. snowflake/ml/modeling/linear_model/linear_regression.py +51 -52
  141. snowflake/ml/modeling/linear_model/logistic_regression.py +51 -52
  142. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -52
  143. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -52
  144. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -52
  145. snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -52
  146. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -52
  147. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -52
  148. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -52
  149. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -52
  150. snowflake/ml/modeling/linear_model/perceptron.py +51 -52
  151. snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -52
  152. snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -52
  153. snowflake/ml/modeling/linear_model/ridge.py +51 -52
  154. snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -52
  155. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -52
  156. snowflake/ml/modeling/linear_model/ridge_cv.py +51 -52
  157. snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -52
  158. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -52
  159. snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -52
  160. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -52
  161. snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -52
  162. snowflake/ml/modeling/manifold/isomap.py +53 -52
  163. snowflake/ml/modeling/manifold/mds.py +53 -52
  164. snowflake/ml/modeling/manifold/spectral_embedding.py +53 -52
  165. snowflake/ml/modeling/manifold/tsne.py +53 -52
  166. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -52
  167. snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -52
  168. snowflake/ml/modeling/model_selection/grid_search_cv.py +21 -23
  169. snowflake/ml/modeling/model_selection/randomized_search_cv.py +38 -20
  170. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -52
  171. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -52
  172. snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -52
  173. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -52
  174. snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -52
  175. snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -52
  176. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -52
  177. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -52
  178. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -52
  179. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -52
  180. snowflake/ml/modeling/neighbors/kernel_density.py +51 -52
  181. snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -52
  182. snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -52
  183. snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -52
  184. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +53 -52
  185. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -52
  186. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -52
  187. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +53 -52
  188. snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -52
  189. snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -52
  190. snowflake/ml/modeling/pipeline/pipeline.py +538 -36
  191. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +12 -0
  192. snowflake/ml/modeling/preprocessing/polynomial_features.py +53 -52
  193. snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -52
  194. snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -52
  195. snowflake/ml/modeling/svm/linear_svc.py +51 -52
  196. snowflake/ml/modeling/svm/linear_svr.py +51 -52
  197. snowflake/ml/modeling/svm/nu_svc.py +51 -52
  198. snowflake/ml/modeling/svm/nu_svr.py +51 -52
  199. snowflake/ml/modeling/svm/svc.py +51 -52
  200. snowflake/ml/modeling/svm/svr.py +51 -52
  201. snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -52
  202. snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -52
  203. snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -52
  204. snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -52
  205. snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -52
  206. snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -52
  207. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -52
  208. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -52
  209. snowflake/ml/registry/_manager/model_manager.py +36 -7
  210. snowflake/ml/registry/model_registry.py +3 -149
  211. snowflake/ml/version.py +1 -1
  212. {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +112 -7
  213. {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +216 -206
  214. snowflake/ml/registry/_artifact_manager.py +0 -156
  215. snowflake/ml/registry/artifact.py +0 -46
  216. {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
  217. {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
  218. {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,103 @@
1
+ import dataclasses
2
+ import json
3
+ import typing
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
# JSON key under which `DatasetMetadata.to_json` stores the class name of `properties`.
_PROPERTY_TYPE_KEY = "$proptype$"
# Default value of `DatasetMetadata.schema_version`.
DATASET_SCHEMA_VERSION = "1"
8
+
9
+
10
@dataclasses.dataclass(frozen=True)
class FeatureStoreMetadata:
    """
    Immutable record of feature store metadata.

    Properties:
        spine_query: The input query on source table which will be joined with features.
        serialized_feature_views: A list of serialized feature objects in the feature store.
        spine_timestamp_col: Timestamp column which was used for point-in-time correct feature lookup.
    """

    spine_query: str
    serialized_feature_views: List[str]
    spine_timestamp_col: Optional[str] = None

    def to_json(self) -> str:
        """Return all fields of this record encoded as a JSON object string."""
        field_values = dataclasses.asdict(self)
        return json.dumps(field_values)

    @classmethod
    def from_json(cls, input_json: Union[Dict[str, Any], str, bytes]) -> "FeatureStoreMetadata":
        """Build a record from an already-parsed dict or from JSON text/bytes.

        Args:
            input_json: Either a dict of field values, or a JSON document (str/bytes) encoding one.

        Returns:
            A new FeatureStoreMetadata whose fields are taken from `input_json`.
        """
        # A dict is used as-is; anything else is handed to the JSON parser first.
        field_values = input_json if isinstance(input_json, dict) else json.loads(input_json)
        return cls(**field_values)
33
+
34
+
35
# Every type that may be stored in `DatasetMetadata.properties`.
DatasetPropertiesType = Union[
    FeatureStoreMetadata,
]

# Union[T] gets automatically squashed to T, in which case get_args() returns an empty
# tuple; fall back to a one-element list so the lookup table below is still populated.
_DatasetPropTypes = typing.get_args(DatasetPropertiesType) or [DatasetPropertiesType]
# Class name -> class, used to map the serialized type tag back to a constructor.
_DatasetPropTypeDict = {prop_cls.__name__: prop_cls for prop_cls in _DatasetPropTypes}
42
+
43
+
44
@dataclasses.dataclass(frozen=True)
class DatasetMetadata:
    """
    Dataset metadata.

    Properties:
        source_query: The query string used to produce the Dataset.
        owner: The owner of the Dataset.
        exclude_cols: Name of column(s) in dataset to be excluded during training/testing.
            These are typically columns for human inspection such as timestamp or other meta-information.
            Columns included in `label_cols` do not need to be included here.
        label_cols: Name of column(s) in dataset that contains labels.
        properties: Additional metadata properties.
        schema_version: Version tag of the serialized layout. It is not a constructor argument
            and defaults to `DATASET_SCHEMA_VERSION`.
    """

    source_query: str
    owner: str
    exclude_cols: Optional[List[str]] = None
    label_cols: Optional[List[str]] = None
    properties: Optional[DatasetPropertiesType] = None
    schema_version: str = dataclasses.field(default=DATASET_SCHEMA_VERSION, init=False)

    def to_json(self) -> str:
        """Serialize this metadata into a JSON object string.

        When `properties` is set, its class name is stored alongside the fields under
        `_PROPERTY_TYPE_KEY` so that `from_json` can rebuild an object of the same type.

        Returns:
            The JSON encoding of all fields (plus the property type tag, if any).

        Raises:
            ValueError: `properties` is an instance of a class that is not in `_DatasetPropTypeDict`.
        """
        state_dict = dataclasses.asdict(self)
        if self.properties:
            prop_type = type(self.properties).__name__
            if prop_type not in _DatasetPropTypeDict:
                raise ValueError(
                    f"Unsupported `properties` type={prop_type} (supported={','.join(_DatasetPropTypeDict.keys())})"
                )
            state_dict[_PROPERTY_TYPE_KEY] = prop_type
        return json.dumps(state_dict)

    @classmethod
    def from_json(cls, input_json: Union[Dict[str, Any], str, bytes]) -> "DatasetMetadata":
        """Rebuild a DatasetMetadata from a parsed dict or from JSON text/bytes.

        A dict argument is never modified: the method works on a shallow copy of it.

        Args:
            input_json: Either a dict of field values, or a JSON document (str/bytes) encoding one.

        Returns:
            The deserialized DatasetMetadata.

        Raises:
            ValueError: `input_json` is empty or None, is not valid JSON, does not decode to a
                JSON object, or its content does not match the expected fields.
        """
        if not input_json:
            raise ValueError("json_str was empty or None")
        try:
            if isinstance(input_json, dict):
                # Copy first: the keys popped/replaced below must not leak into the caller's dict.
                state_dict: Dict[str, Any] = dict(input_json)
            else:
                state_dict = json.loads(input_json, strict=False)
                if not isinstance(state_dict, dict):
                    # Reported as ValueError by the handler below, like any other schema problem.
                    raise TypeError(f"Expected a JSON object but got {type(state_dict).__name__}")

            # TODO: Validate schema version
            # `schema_version` is declared with init=False, so it must not reach the constructor.
            _ = state_dict.pop("schema_version", DATASET_SCHEMA_VERSION)

            prop_type = state_dict.pop(_PROPERTY_TYPE_KEY, None)
            prop_values = state_dict.get("properties", {})
            if prop_type:
                prop_cls = _DatasetPropTypeDict.get(prop_type, None)
                if prop_cls is None:
                    raise TypeError(
                        f"Unsupported `properties` type={prop_type} (supported={','.join(_DatasetPropTypeDict.keys())})"
                    )
                state_dict["properties"] = prop_cls(**prop_values)
            elif prop_values:
                # Without the type tag there is no way to tell which class to instantiate.
                raise TypeError(f"`properties` provided but missing `{_PROPERTY_TYPE_KEY}`")
            return cls(**state_dict)
        except TypeError as e:
            raise ValueError("Invalid input schema") from e
@@ -0,0 +1,199 @@
1
+ from typing import Any, List
2
+
3
+ import pandas as pd
4
+ from pyarrow import parquet as pq
5
+
6
+ from snowflake import snowpark
7
+ from snowflake.ml._internal import telemetry
8
+ from snowflake.ml._internal.lineage import data_source, lineage_utils
9
+ from snowflake.ml._internal.utils import import_utils
10
+ from snowflake.ml.fileset import snowfs
11
+
12
# Telemetry labels attached to every instrumented DatasetReader method.
_PROJECT = "Dataset"
_SUBPROJECT = "DatasetReader"
TARGET_FILE_SIZE = 32 * 2**20  # The max file size for data loading.
15
+
16
+
17
class DatasetReader:
    """Reads the files behind a list of data sources and hands them to pandas, Snowpark, PyTorch or TensorFlow."""

    @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
    def __init__(
        self,
        session: snowpark.Session,
        sources: List[data_source.DataSource],
    ) -> None:
        """Create a reader over the given data sources.

        Args:
            session: Snowpark Session to interact with Snowflake backend.
            sources: Data sources to read from.

        Raises:
            ValueError: `sources` arg was empty or null
        """
        if not sources:
            raise ValueError("Invalid input: empty `sources` list not allowed")

        self._session = session
        self._sources = sources
        # Filled lazily by `_list_files`; an empty list means "nothing cached yet".
        self._files: List[str] = []
        self._fs: snowfs.SnowFileSystem = snowfs.SnowFileSystem(
            snowpark_session=self._session,
            cache_type="bytes",
            block_size=2 * TARGET_FILE_SIZE,
        )

    def _list_files(self) -> List[str]:
        """List the files of every source, returning the cached listing when one is available."""
        if not self._files:
            listed: List[str] = []
            for src in self._sources:
                # Sort within each source for consistent ordering
                listed += sorted(self._fs.ls(src.url))  # type: ignore[arg-type]
            listed.sort()
            self._files = listed
        return self._files

    @property
    def data_sources(self) -> List[data_source.DataSource]:
        """The data sources this reader was constructed with."""
        return self._sources

    @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
    def files(self) -> List[str]:
        """Get the list of remote file paths backing this reader.

        Each listed path is passed through the filesystem's `unstrip_protocol`.

        Returns:
            A list of remote file paths

        Example:
        >>> dsv.files()
        ----
        ["snow://dataset/mydb.myschema.mydataset/versions/test/data_0_0_0.snappy.parquet",
         "snow://dataset/mydb.myschema.mydataset/versions/test/data_0_0_1.snappy.parquet"]
        """
        return [self._fs.unstrip_protocol(path) for path in self._list_files()]

    @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
    def filesystem(self) -> snowfs.SnowFileSystem:
        """Return the filesystem object this reader uses to access the paths listed by `files()`."""
        return self._fs

    @telemetry.send_api_usage_telemetry(
        project=_PROJECT,
        subproject=_SUBPROJECT,
        func_params_to_log=["batch_size", "shuffle", "drop_last_batch"],
    )
    def to_torch_datapipe(self, *, batch_size: int, shuffle: bool = False, drop_last_batch: bool = True) -> Any:
        """Transform the Snowflake data into a ready-to-use Pytorch datapipe.

        Return a Pytorch datapipe which iterates on rows of data.

        Args:
            batch_size: It specifies the size of each data batch which will be
                yield in the result datapipe
            shuffle: It specifies whether the data will be shuffled. If True, files will be shuffled, and
                rows in each file will also be shuffled.
            drop_last_batch: Whether the last batch of data should be dropped. If set to be true,
                then the last batch will get dropped if its size is smaller than the given batch_size.

        Returns:
            A Pytorch iterable datapipe that yield data.

        Examples:
        >>> dp = dataset.to_torch_datapipe(batch_size=1)
        >>> for data in dp:
        >>>     print(data)
        ----
        {'_COL_1':[10]}
        """
        # Both dependencies are resolved at call time through import_or_get_dummy.
        iterable_wrapper_cls, _ = import_utils.import_or_get_dummy("torchdata.datapipes.iter.IterableWrapper")
        datapipe_module, _ = import_utils.import_or_get_dummy("snowflake.ml.fileset.torch_datapipe")

        self._fs.optimize_read(self._list_files())

        file_pipe = iterable_wrapper_cls(self._list_files())
        return datapipe_module.ReadAndParseParquet(file_pipe, self._fs, batch_size, shuffle, drop_last_batch)

    @telemetry.send_api_usage_telemetry(
        project=_PROJECT,
        subproject=_SUBPROJECT,
        func_params_to_log=["batch_size", "shuffle", "drop_last_batch"],
    )
    def to_tf_dataset(self, *, batch_size: int, shuffle: bool = False, drop_last_batch: bool = True) -> Any:
        """Transform the Snowflake data into a ready-to-use TensorFlow tf.data.Dataset.

        Args:
            batch_size: It specifies the size of each data batch which will be
                yield in the result datapipe
            shuffle: It specifies whether the data will be shuffled. If True, files will be shuffled, and
                rows in each file will also be shuffled.
            drop_last_batch: Whether the last batch of data should be dropped. If set to be true,
                then the last batch will get dropped if its size is smaller than the given batch_size.

        Returns:
            A tf.data.Dataset that yields batched tf.Tensors.

        Examples:
        >>> dp = dataset.to_tf_dataset(batch_size=1)
        >>> for data in dp:
        >>>     print(data)
        ----
        {'_COL_1': <tf.Tensor: shape=(1,), dtype=int64, numpy=[10]>}
        """
        # Resolved at call time through import_or_get_dummy.
        tf_module, _ = import_utils.import_or_get_dummy("snowflake.ml.fileset.tf_dataset")

        self._fs.optimize_read(self._list_files())

        return tf_module.read_and_parse_parquet(self._list_files(), self._fs, batch_size, shuffle, drop_last_batch)

    @telemetry.send_api_usage_telemetry(
        project=_PROJECT,
        subproject=_SUBPROJECT,
        func_params_to_log=["only_feature_cols"],
    )
    def to_snowpark_dataframe(self, only_feature_cols: bool = False) -> snowpark.DataFrame:
        """Load every data source as a Snowpark DataFrame and union them by column name.

        Args:
            only_feature_cols: If True, each source's `exclude_cols` (when non-empty) are dropped from
                the DataFrame read for that source. The stored data is unaffected.

        Returns:
            A Snowpark dataframe that contains the data of all sources.

        Note: The dataframe generated by this method might not have the same schema as the original one. Specifically,
            - NUMBER type with scale != 0 will become float.
            - Unsupported types (see comments of :func:`Dataset.create_version`) will not have any guarantee.
                For example, an OBJECT column may be scanned back as a STRING column.
        """
        parquet_pattern = ".*data_.*[.]parquet"

        # Read all sources first, then merge, so the read order follows `self._sources`.
        frames: List[snowpark.DataFrame] = []
        for src in self._sources:
            frame = self._session.read.option("pattern", parquet_pattern).parquet(src.url)
            if only_feature_cols and src.exclude_cols:
                frame = frame.drop(src.exclude_cols)
            frames.append(frame)

        merged = frames[0]
        for frame in frames[1:]:
            merged = merged.union_all_by_name(frame)
        return lineage_utils.patch_dataframe(merged, data_sources=self._sources, inplace=True)

    @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
    def to_pandas(self) -> pd.DataFrame:
        """Read all listed parquet files into one Pandas DataFrame (empty when no file is listed)."""
        listed = self._list_files()
        if listed:
            self._fs.optimize_read(listed)
            parquet_ds = pq.ParquetDataset(listed, filesystem=self._fs)
            return parquet_ds.read_pandas().to_pandas()
        return pd.DataFrame()  # Return empty DataFrame
@@ -2,8 +2,14 @@ import os
2
2
 
3
3
  from snowflake.ml._internal import init_utils
4
4
 
5
+ from .access_manager import setup_feature_store
6
+
5
7
  pkg_dir = os.path.dirname(os.path.abspath(__file__))
6
8
  pkg_name = __name__
7
9
  exportable_classes = init_utils.fetch_classes_from_modules_in_pkg_dir(pkg_dir=pkg_dir, pkg_name=pkg_name)
8
10
  for k, v in exportable_classes.items():
9
11
  globals()[k] = v
12
+
13
+ __all__ = list(exportable_classes.keys()) + [
14
+ "setup_feature_store",
15
+ ]
@@ -0,0 +1,279 @@
1
+ from dataclasses import asdict, dataclass
2
+ from enum import Enum
3
+ from typing import Dict, List, Optional
4
+ from warnings import warn
5
+
6
+ from snowflake.ml._internal import telemetry
7
+ from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
8
+ from snowflake.ml._internal.utils.sql_identifier import SqlIdentifier
9
+ from snowflake.ml.feature_store.feature_store import (
10
+ _FEATURE_STORE_OBJECT_TAG,
11
+ _FEATURE_VIEW_METADATA_TAG,
12
+ CreationMode,
13
+ FeatureStore,
14
+ )
15
+ from snowflake.snowpark import Session, exceptions
16
+
17
_PROJECT = "FeatureStore"
# Special flag to mark "all+future" grants: `_grant_privileges` expands it into
# one FUTURE grant and one ALL grant for the object type.
_ALL_OBJECTS = "@ALL_OBJECTS"
19
+
20
+
21
class _FeatureStoreRole(Enum):
    """Role kinds used as keys of the privilege tables in this module."""

    # For testing purposes
    NONE = 0
    CONSUMER = 1
    PRODUCER = 2
25
+
26
+
27
@dataclass(frozen=True)
class _Privilege:
    """One grant specification consumed by `_grant_privileges`."""

    # Object type keyword placed after `ON` in the GRANT statement, e.g. "SCHEMA".
    object_type: str
    # Name template formatted with the session info fields, or `_ALL_OBJECTS`.
    object_name: str
    # Privilege names, comma-joined into a single GRANT statement.
    privileges: List[str]
    # Optional template appended as `IN <scope>`.
    scope: Optional[str] = None
33
+
34
+
35
@dataclass(frozen=True)
class _SessionInfo:
    """Identifiers substituted into the `{database}` / `{schema}` / `{warehouse}` grant templates."""

    database: SqlIdentifier
    schema: SqlIdentifier
    warehouse: SqlIdentifier
40
+
41
+
42
# Per-role lists of `_Privilege` records applied by `_configure_pre_init_privileges`.
_PRE_INIT_PRIVILEGES: Dict[_FeatureStoreRole, List[_Privilege]] = {
    _FeatureStoreRole.PRODUCER: [
        _Privilege(
            "SCHEMA",
            "{database}.{schema}",
            [
                "CREATE DYNAMIC TABLE",
                "CREATE TAG",
                "CREATE VIEW",
                "CREATE TASK",
                "CREATE TABLE",
            ],
        ),
        _Privilege(
            "SCHEMA",
            "{database}.{schema}",
            [
                "CREATE DATASET",  # Handle DATASET privilege separately since it may not be enabled
            ],
        ),
        _Privilege("DYNAMIC TABLE", _ALL_OBJECTS, ["OPERATE"], "SCHEMA {database}.{schema}"),
        _Privilege("TASK", _ALL_OBJECTS, ["OPERATE"], "SCHEMA {database}.{schema}"),
    ],
    _FeatureStoreRole.CONSUMER: [
        _Privilege("DATABASE", "{database}", ["USAGE"]),
        _Privilege("SCHEMA", "{database}.{schema}", ["USAGE"]),
        _Privilege("DYNAMIC TABLE", _ALL_OBJECTS, ["SELECT", "MONITOR"], "SCHEMA {database}.{schema}"),
        _Privilege("VIEW", _ALL_OBJECTS, ["SELECT", "REFERENCES"], "SCHEMA {database}.{schema}"),
        _Privilege("TABLE", _ALL_OBJECTS, ["SELECT", "REFERENCES"], "SCHEMA {database}.{schema}"),
        # FIXME(dhung): FUTURE DATASETS not supported until 8.19
        # _Privilege("DATASET", _ALL_OBJECTS, ["USAGE"], "SCHEMA {database}.{schema}"),
        # User should decide whether they want to grant warehouse usage to CONSUMER
        # _Privilege("WAREHOUSE", "{warehouse}", ["USAGE"]),
    ],
    _FeatureStoreRole.NONE: [],
}
79
+
80
# Per-role lists of `_Privilege` records applied by `_configure_post_init_privileges`.
# The doubled braces keep `{database}` / `{schema}` as placeholders for the later
# `str.format` call, while the tag names are interpolated right here.
_POST_INIT_PRIVILEGES: Dict[_FeatureStoreRole, List[_Privilege]] = {
    _FeatureStoreRole.PRODUCER: [
        _Privilege("TAG", f"{{database}}.{{schema}}.{_FEATURE_VIEW_METADATA_TAG}", ["APPLY"]),
        _Privilege("TAG", f"{{database}}.{{schema}}.{_FEATURE_STORE_OBJECT_TAG}", ["APPLY"]),
    ],
    _FeatureStoreRole.CONSUMER: [],
    _FeatureStoreRole.NONE: [],
}
88
+
89
+
90
def _grant_privileges(
    session: Session, role_name: str, privileges: List[_Privilege], session_info: _SessionInfo
) -> None:
    """
    Issue one or two GRANT statements per privilege record on behalf of `role_name`.

    A record whose `object_name` is `_ALL_OBJECTS` produces a `FUTURE <TYPE>S` grant followed by
    an `ALL <TYPE>S` grant; any other record produces a single grant on the formatted object name.
    A `SnowparkSQLException` whose message contains one of the tolerated markers is reported as a
    `UserWarning`; every other one is re-raised.

    Args:
        session: Snowpark Session used to run the GRANT statements.
        role_name: Role receiving the privileges.
        privileges: Privilege records to grant, processed in order.
        session_info: Provides the values for the placeholders in object names and scopes.
    """
    format_args = asdict(session_info)
    for priv in privileges:
        if priv.object_name != _ALL_OBJECTS:
            targets = [f"{priv.object_type} {priv.object_name.format(**format_args)}"]
        else:
            # Ensure obj is plural
            plural = priv.object_type.upper()
            if not plural.endswith("S"):
                plural = plural + "S"
            targets = [f"FUTURE {plural}", f"ALL {plural}"]

        # NOTE(review): one try block spans both targets, so a tolerated failure on the FUTURE
        # grant also skips the ALL grant for the same record -- confirm this is intended.
        try:
            for target in targets:
                scope_clause = f" IN {priv.scope.format(**format_args)}" if priv.scope else ""
                statement = f"GRANT {','.join(priv.privileges)} ON {target}{scope_clause} TO ROLE {role_name}"
                session.sql(statement).collect()
        except exceptions.SnowparkSQLException as e:
            tolerated_markers = (
                "Ask your account admin",
                "Object type or Class",
                priv.object_type,
            )
            if not any(marker in e.message for marker in tolerated_markers):
                raise
            warn(
                f"Failed to grant privilege for {priv.object_type}: {e.message}",
                UserWarning,
                stacklevel=1,
            )
126
+
127
+
128
def _configure_pre_init_privileges(
    session: Session,
    session_info: _SessionInfo,
    producer_role: str = "SNOWML_FEATURE_STORE_PRODUCER_RL",
    consumer_role: str = "SNOWML_FEATURE_STORE_CONSUMER_RL",
) -> None:
    """
    Create the Feature Store schema if needed and grant the pre-init privileges to both roles.
    Must be run with ACCOUNTADMIN or a role with `MANAGE GRANTS` privilege.

    See https://docs.snowflake.com/en/sql-reference/sql/grant-privilege for more information
    about privilege grants in Snowflake.

    Args:
        session: Snowpark Session to interact with Snowflake backend.
        session_info: Session info like database and schema for the FeatureStore instance.
        producer_role: Name of producer role to be configured.
        consumer_role: Name of consumer role to be configured.
    """
    qualified_schema = f"{session_info.database}.{session_info.schema}"

    # Create schema if not already exists; the statement must return exactly one row with a status.
    validator = SqlResultValidator(session, f"CREATE SCHEMA IF NOT EXISTS {qualified_schema}")
    (create_row,) = validator.has_dimensions(expected_rows=1).has_column("status").validate()

    # Pass schema ownership from admin to PRODUCER, but only for a schema created just now.
    if create_row["status"].endswith("successfully created."):
        session.sql(f"GRANT OWNERSHIP ON SCHEMA {qualified_schema} TO ROLE {producer_role}").collect()

    # Grant privileges to roles, producer first.
    for role_name, role_kind in (
        (producer_role, _FeatureStoreRole.PRODUCER),
        (consumer_role, _FeatureStoreRole.CONSUMER),
    ):
        _grant_privileges(session, role_name, _PRE_INIT_PRIVILEGES[role_kind], session_info)
169
+
170
+
171
def _configure_post_init_privileges(
    session: Session,
    session_info: _SessionInfo,
    producer_role: str = "FS_PRODUCER",
    consumer_role: str = "FS_CONSUMER",
) -> None:
    """
    Grant the post-init privileges of the producer and consumer roles.

    Args:
        session: Snowpark Session to interact with Snowflake backend.
        session_info: Session info like database and schema for the FeatureStore instance.
        producer_role: Name of producer role to be configured.
        consumer_role: Name of consumer role to be configured.
    """
    role_kinds = (
        (producer_role, _FeatureStoreRole.PRODUCER),
        (consumer_role, _FeatureStoreRole.CONSUMER),
    )
    # Producer grants are issued before consumer grants.
    for role_name, role_kind in role_kinds:
        _grant_privileges(session, role_name, _POST_INIT_PRIVILEGES[role_kind], session_info)
179
+
180
+
181
def _configure_role_hierarchy(
    session: Session,
    producer_role: str,
    consumer_role: str,
) -> None:
    """
    Create Feature Store roles and configure the role hierarchy. Must be run with
    ACCOUNTADMIN or a role with `CREATE ROLE` privilege.

    The resulting hierarchy grants the consumer role to the producer role, and the producer
    role to SYSADMIN and to the session's current role (when the session has one).

    See https://docs.snowflake.com/en/sql-reference/sql/grant-privilege for more information
    about privilege grants in Snowflake.

    Args:
        session: Snowpark Session to interact with Snowflake backend.
        producer_role: Name of producer role to be configured.
        consumer_role: Name of consumer role to be configured.
    """
    producer_role = SqlIdentifier(producer_role)
    consumer_role = SqlIdentifier(consumer_role)

    # Create the necessary roles
    session.sql(f"CREATE ROLE IF NOT EXISTS {producer_role}").collect()
    session.sql(f"CREATE ROLE IF NOT EXISTS {consumer_role}").collect()

    # Build role hierarchy
    session.sql(f"GRANT ROLE {consumer_role} TO ROLE {producer_role}").collect()
    session.sql(f"GRANT ROLE {producer_role} TO ROLE SYSADMIN").collect()

    # get_current_role() may return None; interpolating that would emit "TO ROLE None",
    # so the grant to the current role is only issued when one is actually set.
    current_role = session.get_current_role()
    if current_role is not None:
        session.sql(f"GRANT ROLE {producer_role} TO ROLE {current_role}").collect()
209
+
210
+
211
@telemetry.send_api_usage_telemetry(project=_PROJECT)
def setup_feature_store(
    session: Session,
    database: str,
    schema: str,
    warehouse: str,
    producer_role: str = "FS_PRODUCER",
    consumer_role: str = "FS_CONSUMER",
) -> FeatureStore:
    """
    Create a new Feature Store together with its roles and privileges. Must be run with
    ACCOUNTADMIN or a role with `MANAGE GRANTS` and `CREATE ROLE` privileges.

    The steps are: build the role hierarchy, apply the pre-init privileges, construct the
    FeatureStore while temporarily using the producer role, then apply the post-init privileges.

    See https://docs.snowflake.com/en/sql-reference/sql/grant-privilege for more information
    about privilege grants in Snowflake.

    Args:
        session: Snowpark Session to interact with Snowflake backend.
        database: Database to create the FeatureStore instance.
        schema: Schema to create the FeatureStore instance.
        warehouse: Default warehouse for Feature Store compute.
        producer_role: Name of producer role to be configured.
        consumer_role: Name of consumer role to be configured.

    Returns:
        Feature Store instance.

    Raises:
        exceptions.SnowparkSQLException: Insufficient privileges.
    """

    database = SqlIdentifier(database)
    schema = SqlIdentifier(schema)
    warehouse = SqlIdentifier(warehouse)
    session_info = _SessionInfo(
        SqlIdentifier(database),
        SqlIdentifier(schema),
        SqlIdentifier(warehouse),
    )

    try:
        _configure_role_hierarchy(session, producer_role=producer_role, consumer_role=consumer_role)
    except exceptions.SnowparkSQLException:
        # The failure is tolerated only when both roles already exist and the
        # consumer role has already been granted to the producer role.
        both_roles_exist = all(
            session.sql(f"SHOW ROLES LIKE '{role}' STARTS WITH '{role}'").count() != 0
            for role in (producer_role, consumer_role)
        )
        if not both_roles_exist:
            raise

        grants_on_consumer = session.sql(f"SHOW GRANTS ON ROLE {consumer_role}").collect()
        producer_has_consumer = any(
            row["granted_to"] == "ROLE" and row["grantee_name"] == producer_role for row in grants_on_consumer
        )
        if not producer_has_consumer:
            raise

    # Privilege setup that must precede FeatureStore.__init__()
    _configure_pre_init_privileges(session, session_info, producer_role, consumer_role)

    # Create the Feature Store as the PRODUCER role, then restore the caller's role
    # whether or not creation succeeded.
    original_role = session.get_current_role()
    assert original_role is not None  # to make mypy happy
    try:
        session.use_role(producer_role)
        feature_store = FeatureStore(
            session, database, schema, warehouse, creation_mode=CreationMode.CREATE_IF_NOT_EXIST
        )
    finally:
        session.use_role(original_role)

    # Privilege setup that must follow FeatureStore.__init__()
    _configure_post_init_privileges(session, session_info, producer_role, consumer_role)

    return feature_store