snowflake-ml-python 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. snowflake/ml/_internal/env_utils.py +77 -32
  2. snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
  3. snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
  4. snowflake/ml/_internal/exceptions/error_codes.py +3 -0
  5. snowflake/ml/_internal/lineage/data_source.py +10 -0
  6. snowflake/ml/_internal/lineage/dataset_dataframe.py +44 -0
  7. snowflake/ml/_internal/utils/identifier.py +3 -1
  8. snowflake/ml/_internal/utils/sql_identifier.py +2 -6
  9. snowflake/ml/dataset/__init__.py +10 -0
  10. snowflake/ml/dataset/dataset.py +454 -129
  11. snowflake/ml/dataset/dataset_factory.py +53 -0
  12. snowflake/ml/dataset/dataset_metadata.py +103 -0
  13. snowflake/ml/dataset/dataset_reader.py +202 -0
  14. snowflake/ml/feature_store/feature_store.py +531 -332
  15. snowflake/ml/feature_store/feature_view.py +40 -23
  16. snowflake/ml/fileset/embedded_stage_fs.py +146 -0
  17. snowflake/ml/fileset/sfcfs.py +56 -54
  18. snowflake/ml/fileset/snowfs.py +159 -0
  19. snowflake/ml/fileset/stage_fs.py +49 -17
  20. snowflake/ml/model/__init__.py +2 -2
  21. snowflake/ml/model/_api.py +16 -1
  22. snowflake/ml/model/_client/model/model_impl.py +27 -0
  23. snowflake/ml/model/_client/model/model_version_impl.py +137 -50
  24. snowflake/ml/model/_client/ops/model_ops.py +159 -40
  25. snowflake/ml/model/_client/sql/model.py +25 -2
  26. snowflake/ml/model/_client/sql/model_version.py +131 -2
  27. snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
  28. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
  29. snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
  30. snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
  31. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
  32. snowflake/ml/model/_model_composer/model_composer.py +22 -1
  33. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +38 -51
  34. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +19 -1
  35. snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
  36. snowflake/ml/model/_packager/model_env/model_env.py +41 -0
  37. snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
  38. snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
  39. snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
  40. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  41. snowflake/ml/model/_packager/model_meta/model_meta.py +37 -11
  42. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
  43. snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
  44. snowflake/ml/model/_packager/model_packager.py +2 -5
  45. snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
  46. snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
  47. snowflake/ml/model/type_hints.py +21 -2
  48. snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
  49. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
  50. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
  51. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
  52. snowflake/ml/modeling/_internal/model_trainer.py +7 -0
  53. snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
  54. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
  55. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +29 -7
  56. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +261 -16
  57. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +246 -175
  58. snowflake/ml/modeling/cluster/affinity_propagation.py +246 -175
  59. snowflake/ml/modeling/cluster/agglomerative_clustering.py +246 -175
  60. snowflake/ml/modeling/cluster/birch.py +248 -175
  61. snowflake/ml/modeling/cluster/bisecting_k_means.py +248 -175
  62. snowflake/ml/modeling/cluster/dbscan.py +246 -175
  63. snowflake/ml/modeling/cluster/feature_agglomeration.py +248 -175
  64. snowflake/ml/modeling/cluster/k_means.py +248 -175
  65. snowflake/ml/modeling/cluster/mean_shift.py +246 -175
  66. snowflake/ml/modeling/cluster/mini_batch_k_means.py +248 -175
  67. snowflake/ml/modeling/cluster/optics.py +246 -175
  68. snowflake/ml/modeling/cluster/spectral_biclustering.py +246 -175
  69. snowflake/ml/modeling/cluster/spectral_clustering.py +246 -175
  70. snowflake/ml/modeling/cluster/spectral_coclustering.py +246 -175
  71. snowflake/ml/modeling/compose/column_transformer.py +248 -175
  72. snowflake/ml/modeling/compose/transformed_target_regressor.py +246 -175
  73. snowflake/ml/modeling/covariance/elliptic_envelope.py +246 -175
  74. snowflake/ml/modeling/covariance/empirical_covariance.py +246 -175
  75. snowflake/ml/modeling/covariance/graphical_lasso.py +246 -175
  76. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +246 -175
  77. snowflake/ml/modeling/covariance/ledoit_wolf.py +246 -175
  78. snowflake/ml/modeling/covariance/min_cov_det.py +246 -175
  79. snowflake/ml/modeling/covariance/oas.py +246 -175
  80. snowflake/ml/modeling/covariance/shrunk_covariance.py +246 -175
  81. snowflake/ml/modeling/decomposition/dictionary_learning.py +248 -175
  82. snowflake/ml/modeling/decomposition/factor_analysis.py +248 -175
  83. snowflake/ml/modeling/decomposition/fast_ica.py +248 -175
  84. snowflake/ml/modeling/decomposition/incremental_pca.py +248 -175
  85. snowflake/ml/modeling/decomposition/kernel_pca.py +248 -175
  86. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +248 -175
  87. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +248 -175
  88. snowflake/ml/modeling/decomposition/pca.py +248 -175
  89. snowflake/ml/modeling/decomposition/sparse_pca.py +248 -175
  90. snowflake/ml/modeling/decomposition/truncated_svd.py +248 -175
  91. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +248 -175
  92. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +246 -175
  93. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +246 -175
  94. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +246 -175
  95. snowflake/ml/modeling/ensemble/bagging_classifier.py +246 -175
  96. snowflake/ml/modeling/ensemble/bagging_regressor.py +246 -175
  97. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +246 -175
  98. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +246 -175
  99. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +246 -175
  100. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +246 -175
  101. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +246 -175
  102. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +246 -175
  103. snowflake/ml/modeling/ensemble/isolation_forest.py +246 -175
  104. snowflake/ml/modeling/ensemble/random_forest_classifier.py +246 -175
  105. snowflake/ml/modeling/ensemble/random_forest_regressor.py +246 -175
  106. snowflake/ml/modeling/ensemble/stacking_regressor.py +248 -175
  107. snowflake/ml/modeling/ensemble/voting_classifier.py +248 -175
  108. snowflake/ml/modeling/ensemble/voting_regressor.py +248 -175
  109. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +248 -175
  110. snowflake/ml/modeling/feature_selection/select_fdr.py +248 -175
  111. snowflake/ml/modeling/feature_selection/select_fpr.py +248 -175
  112. snowflake/ml/modeling/feature_selection/select_fwe.py +248 -175
  113. snowflake/ml/modeling/feature_selection/select_k_best.py +248 -175
  114. snowflake/ml/modeling/feature_selection/select_percentile.py +248 -175
  115. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +248 -175
  116. snowflake/ml/modeling/feature_selection/variance_threshold.py +248 -175
  117. snowflake/ml/modeling/framework/_utils.py +8 -1
  118. snowflake/ml/modeling/framework/base.py +72 -37
  119. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +246 -175
  120. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +246 -175
  121. snowflake/ml/modeling/impute/iterative_imputer.py +248 -175
  122. snowflake/ml/modeling/impute/knn_imputer.py +248 -175
  123. snowflake/ml/modeling/impute/missing_indicator.py +248 -175
  124. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +248 -175
  125. snowflake/ml/modeling/kernel_approximation/nystroem.py +248 -175
  126. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +248 -175
  127. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +248 -175
  128. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +248 -175
  129. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +246 -175
  130. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +246 -175
  131. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +246 -175
  132. snowflake/ml/modeling/linear_model/ard_regression.py +246 -175
  133. snowflake/ml/modeling/linear_model/bayesian_ridge.py +246 -175
  134. snowflake/ml/modeling/linear_model/elastic_net.py +246 -175
  135. snowflake/ml/modeling/linear_model/elastic_net_cv.py +246 -175
  136. snowflake/ml/modeling/linear_model/gamma_regressor.py +246 -175
  137. snowflake/ml/modeling/linear_model/huber_regressor.py +246 -175
  138. snowflake/ml/modeling/linear_model/lars.py +246 -175
  139. snowflake/ml/modeling/linear_model/lars_cv.py +246 -175
  140. snowflake/ml/modeling/linear_model/lasso.py +246 -175
  141. snowflake/ml/modeling/linear_model/lasso_cv.py +246 -175
  142. snowflake/ml/modeling/linear_model/lasso_lars.py +246 -175
  143. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +246 -175
  144. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +246 -175
  145. snowflake/ml/modeling/linear_model/linear_regression.py +246 -175
  146. snowflake/ml/modeling/linear_model/logistic_regression.py +246 -175
  147. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +246 -175
  148. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +246 -175
  149. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +246 -175
  150. snowflake/ml/modeling/linear_model/multi_task_lasso.py +246 -175
  151. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +246 -175
  152. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +246 -175
  153. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +246 -175
  154. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +246 -175
  155. snowflake/ml/modeling/linear_model/perceptron.py +246 -175
  156. snowflake/ml/modeling/linear_model/poisson_regressor.py +246 -175
  157. snowflake/ml/modeling/linear_model/ransac_regressor.py +246 -175
  158. snowflake/ml/modeling/linear_model/ridge.py +246 -175
  159. snowflake/ml/modeling/linear_model/ridge_classifier.py +246 -175
  160. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +246 -175
  161. snowflake/ml/modeling/linear_model/ridge_cv.py +246 -175
  162. snowflake/ml/modeling/linear_model/sgd_classifier.py +246 -175
  163. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +246 -175
  164. snowflake/ml/modeling/linear_model/sgd_regressor.py +246 -175
  165. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +246 -175
  166. snowflake/ml/modeling/linear_model/tweedie_regressor.py +246 -175
  167. snowflake/ml/modeling/manifold/isomap.py +248 -175
  168. snowflake/ml/modeling/manifold/mds.py +248 -175
  169. snowflake/ml/modeling/manifold/spectral_embedding.py +248 -175
  170. snowflake/ml/modeling/manifold/tsne.py +248 -175
  171. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +246 -175
  172. snowflake/ml/modeling/mixture/gaussian_mixture.py +246 -175
  173. snowflake/ml/modeling/model_selection/grid_search_cv.py +63 -41
  174. snowflake/ml/modeling/model_selection/randomized_search_cv.py +80 -38
  175. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +246 -175
  176. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +246 -175
  177. snowflake/ml/modeling/multiclass/output_code_classifier.py +246 -175
  178. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +246 -175
  179. snowflake/ml/modeling/naive_bayes/categorical_nb.py +246 -175
  180. snowflake/ml/modeling/naive_bayes/complement_nb.py +246 -175
  181. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +246 -175
  182. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +246 -175
  183. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +246 -175
  184. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +246 -175
  185. snowflake/ml/modeling/neighbors/kernel_density.py +246 -175
  186. snowflake/ml/modeling/neighbors/local_outlier_factor.py +246 -175
  187. snowflake/ml/modeling/neighbors/nearest_centroid.py +246 -175
  188. snowflake/ml/modeling/neighbors/nearest_neighbors.py +246 -175
  189. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +248 -175
  190. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +246 -175
  191. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +246 -175
  192. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +248 -175
  193. snowflake/ml/modeling/neural_network/mlp_classifier.py +246 -175
  194. snowflake/ml/modeling/neural_network/mlp_regressor.py +246 -175
  195. snowflake/ml/modeling/pipeline/pipeline.py +517 -35
  196. snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
  197. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
  198. snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
  199. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
  200. snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
  201. snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
  202. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +13 -5
  203. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
  204. snowflake/ml/modeling/preprocessing/polynomial_features.py +248 -175
  205. snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
  206. snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
  207. snowflake/ml/modeling/semi_supervised/label_propagation.py +246 -175
  208. snowflake/ml/modeling/semi_supervised/label_spreading.py +246 -175
  209. snowflake/ml/modeling/svm/linear_svc.py +246 -175
  210. snowflake/ml/modeling/svm/linear_svr.py +246 -175
  211. snowflake/ml/modeling/svm/nu_svc.py +246 -175
  212. snowflake/ml/modeling/svm/nu_svr.py +246 -175
  213. snowflake/ml/modeling/svm/svc.py +246 -175
  214. snowflake/ml/modeling/svm/svr.py +246 -175
  215. snowflake/ml/modeling/tree/decision_tree_classifier.py +246 -175
  216. snowflake/ml/modeling/tree/decision_tree_regressor.py +246 -175
  217. snowflake/ml/modeling/tree/extra_tree_classifier.py +246 -175
  218. snowflake/ml/modeling/tree/extra_tree_regressor.py +246 -175
  219. snowflake/ml/modeling/xgboost/xgb_classifier.py +246 -175
  220. snowflake/ml/modeling/xgboost/xgb_regressor.py +246 -175
  221. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +246 -175
  222. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +246 -175
  223. snowflake/ml/registry/model_registry.py +3 -149
  224. snowflake/ml/registry/registry.py +1 -1
  225. snowflake/ml/version.py +1 -1
  226. {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/METADATA +129 -57
  227. snowflake_ml_python-1.5.0.dist-info/RECORD +380 -0
  228. snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
  229. snowflake/ml/registry/_artifact_manager.py +0 -156
  230. snowflake/ml/registry/artifact.py +0 -46
  231. snowflake_ml_python-1.4.0.dist-info/RECORD +0 -370
  232. {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/LICENSE.txt +0 -0
  233. {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/WHEEL +0 -0
  234. {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.5.0.dist-info}/top_level.txt +0 -0
@@ -1,161 +1,486 @@
1
1
  import json
2
- import time
3
- from dataclasses import dataclass
4
- from typing import Any, Dict, List, Optional
2
+ import warnings
3
+ from datetime import datetime
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
5
 
6
- from snowflake.ml.registry.artifact import Artifact, ArtifactType
7
- from snowflake.snowpark import DataFrame, Session
6
+ from snowflake import snowpark
7
+ from snowflake.ml._internal import telemetry
8
+ from snowflake.ml._internal.exceptions import (
9
+ dataset_error_messages,
10
+ dataset_errors,
11
+ error_codes,
12
+ exceptions as snowml_exceptions,
13
+ )
14
+ from snowflake.ml._internal.lineage import data_source
15
+ from snowflake.ml._internal.utils import (
16
+ formatting,
17
+ identifier,
18
+ query_result_checker,
19
+ snowpark_dataframe_utils,
20
+ )
21
+ from snowflake.ml.dataset import dataset_metadata, dataset_reader
22
+ from snowflake.snowpark import exceptions as snowpark_exceptions, functions
8
23
 
24
+ _PROJECT = "Dataset"
25
+ _TELEMETRY_STATEMENT_PARAMS = telemetry.get_function_usage_statement_params(_PROJECT)
26
+ _METADATA_MAX_QUERY_LENGTH = 10000
27
+ _DATASET_VERSION_NAME_COL = "version"
9
28
 
10
- def _get_val_or_null(val: Any) -> Any:
11
- return val if val is not None else "null"
12
29
 
30
+ class DatasetVersion:
31
+ """Represents a version of a Snowflake Dataset"""
13
32
 
14
- def _wrap_embedded_str(s: str) -> str:
15
- s = s.replace("\\", "\\\\")
16
- s = s.replace('"', '\\"')
17
- return s
33
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
34
+ def __init__(
35
+ self,
36
+ dataset: "Dataset",
37
+ version: str,
38
+ ) -> None:
39
+ """Initialize a DatasetVersion object.
18
40
 
41
+ Args:
42
+ dataset: The parent Snowflake Dataset.
43
+ version: Dataset version name.
44
+ """
45
+ self._parent = dataset
46
+ self._version = version
47
+ self._session: snowpark.Session = self._parent._session
19
48
 
20
- DATASET_SCHEMA_VERSION = "1"
49
+ self._properties: Optional[Dict[str, Any]] = None
50
+ self._raw_metadata: Optional[Dict[str, Any]] = None
51
+ self._metadata: Optional[dataset_metadata.DatasetMetadata] = None
21
52
 
53
+ @property
54
+ def name(self) -> str:
55
+ return self._version
22
56
 
23
- @dataclass(frozen=True)
24
- class FeatureStoreMetadata:
25
- """
26
- Feature store metadata.
57
+ @property
58
+ def created_on(self) -> datetime:
59
+ timestamp = self._get_property("created_on")
60
+ assert isinstance(timestamp, datetime)
61
+ return timestamp
27
62
 
28
- Properties:
29
- spine_query: The input query on source table which will be joined with features.
30
- connection_params: a config contains feature store metadata.
31
- features: A list of feature serialized object in the feature store.
63
+ @property
64
+ def comment(self) -> Optional[str]:
65
+ comment: Optional[str] = self._get_property("comment")
66
+ return comment
32
67
 
33
- """
68
+ def _get_property(self, property_name: str, default: Any = None) -> Any:
69
+ if self._properties is None:
70
+ sql_result = (
71
+ query_result_checker.SqlResultValidator(
72
+ self._session,
73
+ f"SHOW VERSIONS LIKE '{self._version}' IN DATASET {self._parent.fully_qualified_name}",
74
+ statement_params=_TELEMETRY_STATEMENT_PARAMS,
75
+ )
76
+ .has_dimensions(expected_rows=1)
77
+ .validate()
78
+ )
79
+ self._properties = sql_result[0].as_dict(True)
80
+ return self._properties.get(property_name, default)
81
+
82
+ def _get_metadata(self) -> Optional[dataset_metadata.DatasetMetadata]:
83
+ if self._raw_metadata is None:
84
+ self._raw_metadata = json.loads(self._get_property("metadata", "{}"))
85
+ try:
86
+ self._metadata = (
87
+ dataset_metadata.DatasetMetadata.from_json(self._raw_metadata) if self._raw_metadata else None
88
+ )
89
+ except ValueError as e:
90
+ warnings.warn(f"Metadata parsing failed with error: {e}", UserWarning, stacklevel=2)
91
+ return self._metadata
34
92
 
35
- spine_query: str
36
- connection_params: Dict[str, str]
37
- features: List[str]
93
+ def _get_exclude_cols(self) -> List[str]:
94
+ metadata = self._get_metadata()
95
+ if metadata is None:
96
+ return []
97
+ cols = []
98
+ if metadata.exclude_cols:
99
+ cols.extend(metadata.exclude_cols)
100
+ if metadata.label_cols:
101
+ cols.extend(metadata.label_cols)
102
+ return cols
38
103
 
39
- def to_json(self) -> str:
40
- state_dict = {
41
- # TODO(zhe): Additional wrap is needed because ml_.artifact.ad_artifact takes a dict
42
- # but we retrieve it as an object. Snowpark serialization is inconsistent with
43
- # our deserialization. A fix is let artifact table stores string and callers
44
- # handles both serialization and deserialization.
45
- "spine_query": self.spine_query,
46
- "connection_params": json.dumps(self.connection_params),
47
- "features": json.dumps(self.features),
48
- }
49
- return json.dumps(state_dict)
104
+ def url(self) -> str:
105
+ """Returns the URL of the DatasetVersion contents in Snowflake.
106
+
107
+ Returns:
108
+ Snowflake URL string.
109
+ """
110
+ path = f"snow://dataset/{self._parent.fully_qualified_name}/versions/{self._version}/"
111
+ return path
50
112
 
51
- @classmethod
52
- def from_json(cls, json_str: str) -> "FeatureStoreMetadata":
53
- json_dict = json.loads(json_str)
54
- return cls(
55
- spine_query=json_dict["spine_query"],
56
- connection_params=json.loads(json_dict["connection_params"]),
57
- features=json.loads(json_dict["features"]),
113
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
114
+ def list_files(self, subdir: Optional[str] = None) -> List[snowpark.Row]:
115
+ """Get the list of remote file paths for the current DatasetVersion."""
116
+ return self._session.sql(f"LIST {self.url()}{subdir or ''}").collect(
117
+ statement_params=_TELEMETRY_STATEMENT_PARAMS
58
118
  )
59
119
 
120
+ def __repr__(self) -> str:
121
+ return f"{self.__class__.__name__}(dataset='{self._parent.fully_qualified_name}', version='{self.name}')"
60
122
 
61
- class Dataset(Artifact):
62
- """Metadata of dataset."""
63
123
 
124
+ class Dataset:
125
+ """Represents a Snowflake Dataset which is organized into versions."""
126
+
127
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
64
128
  def __init__(
65
129
  self,
66
- session: Session,
67
- df: DataFrame,
68
- generation_timestamp: Optional[float] = None,
69
- materialized_table: Optional[str] = None,
70
- snapshot_table: Optional[str] = None,
71
- timestamp_col: Optional[str] = None,
72
- label_cols: Optional[List[str]] = None,
73
- feature_store_metadata: Optional[FeatureStoreMetadata] = None,
74
- desc: str = "",
130
+ session: snowpark.Session,
131
+ database: str,
132
+ schema: str,
133
+ name: str,
134
+ selected_version: Optional[str] = None,
75
135
  ) -> None:
76
- """Initialize dataset object.
136
+ """Initialize a lazily evaluated Dataset object"""
137
+ self._session = session
138
+ self._db = database
139
+ self._schema = schema
140
+ self._name = name
141
+ self._fully_qualified_name = identifier.get_schema_level_object_identifier(database, schema, name)
142
+
143
+ self._version = DatasetVersion(self, selected_version) if selected_version else None
144
+ self._reader: Optional[dataset_reader.DatasetReader] = None
145
+
146
+ @property
147
+ def fully_qualified_name(self) -> str:
148
+ return self._fully_qualified_name
149
+
150
+ @property
151
+ def selected_version(self) -> Optional[DatasetVersion]:
152
+ return self._version
153
+
154
+ @property
155
+ def read(self) -> dataset_reader.DatasetReader:
156
+ if not self.selected_version:
157
+ raise snowml_exceptions.SnowflakeMLException(
158
+ error_code=error_codes.INVALID_ATTRIBUTE,
159
+ original_exception=RuntimeError("No Dataset version selected."),
160
+ )
161
+ if self._reader is None:
162
+ v = self.selected_version
163
+ self._reader = dataset_reader.DatasetReader(
164
+ self._session,
165
+ [
166
+ data_source.DataSource(
167
+ fully_qualified_name=self._fully_qualified_name,
168
+ version=v.name,
169
+ url=v.url(),
170
+ exclude_cols=v._get_exclude_cols(),
171
+ )
172
+ ],
173
+ )
174
+ return self._reader
175
+
176
+ @staticmethod
177
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
178
+ def load(session: snowpark.Session, name: str) -> "Dataset":
179
+ """
180
+ Load an existing Snowflake Dataset. DatasetVersions can be created from the Dataset object
181
+ using `Dataset.create_version()` and loaded with `Dataset.version()`.
77
182
 
78
183
  Args:
79
- session: An active snowpark session.
80
- df: A dataframe object representing the dataset generation.
81
- generation_timestamp: The timestamp when this dataset is generated. It will use current time if
82
- not provided.
83
- materialized_table: The destination table name which data will writes into.
84
- snapshot_table: A snapshot table name on the materialized table.
85
- timestamp_col: Timestamp column which was used for point-in-time correct feature lookup.
86
- label_cols: Name of column(s) in materialized_table that contains labels.
87
- feature_store_metadata: A feature store metadata object.
88
- desc: A description about this dataset.
184
+ session: Snowpark Session to interact with Snowflake backend.
185
+ name: Name of dataset to load. May optionally be a schema-level identifier.
186
+
187
+ Returns:
188
+ Dataset object representing loaded dataset
189
+
190
+ Raises:
191
+ ValueError: name is not a valid Snowflake identifier
192
+ DatasetNotExistError: Specified Dataset does not exist
193
+
194
+ # noqa: DAR402
89
195
  """
90
- self.df = df
91
- self.generation_timestamp = generation_timestamp if generation_timestamp is not None else time.time()
92
- self.materialized_table = materialized_table
93
- self.snapshot_table = snapshot_table
94
- self.timestamp_col = timestamp_col
95
- self.label_cols = label_cols
96
- self.feature_store_metadata = feature_store_metadata
97
- self.desc = desc
98
- self.owner = session.sql("SELECT CURRENT_USER()").collect()[0]["CURRENT_USER()"]
99
- self.schema_version = DATASET_SCHEMA_VERSION
100
-
101
- super().__init__(type=ArtifactType.DATASET, spec=self.to_json())
102
-
103
- def load_features(self) -> Optional[List[str]]:
104
- if self.feature_store_metadata is not None:
105
- return self.feature_store_metadata.features
106
- else:
107
- return None
108
-
109
- def features_df(self) -> DataFrame:
110
- result = self.df
111
- if self.timestamp_col is not None:
112
- result = result.drop(self.timestamp_col)
113
- if self.label_cols is not None:
114
- result = result.drop(self.label_cols)
115
- return result
116
-
117
- def to_json(self) -> str:
118
- if len(self.df.queries["queries"]) != 1:
119
- raise ValueError(
120
- f"""df dataframe must contain only 1 query.
121
- Got {len(self.df.queries['queries'])}: {self.df.queries['queries']}
122
- """
196
+ db, schema, ds_name = _get_schema_level_identifier(session, name)
197
+ _validate_dataset_exists(session, db, schema, ds_name)
198
+ return Dataset(session, db, schema, ds_name)
199
+
200
+ @staticmethod
201
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
202
+ def create(session: snowpark.Session, name: str, exist_ok: bool = False) -> "Dataset":
203
+ """
204
+ Create a new Snowflake Dataset. DatasetVersions can be created from the Dataset object
205
+ using `Dataset.create_version()` and loaded with `Dataset.version()`.
206
+
207
+ Args:
208
+ session: Snowpark Session to interact with Snowflake backend.
209
+ name: Name of dataset to create. May optionally be a schema-level identifier.
210
+ exist_ok: If False, raises an exception if specified Dataset already exists
211
+
212
+ Returns:
213
+ Dataset object representing created dataset
214
+
215
+ Raises:
216
+ ValueError: name is not a valid Snowflake identifier
217
+ DatasetExistError: Specified Dataset already exists
218
+ DatasetError: Dataset creation failed
219
+
220
+ # noqa: DAR401
221
+ # noqa: DAR402
222
+ """
223
+ db, schema, ds_name = _get_schema_level_identifier(session, name)
224
+ ds_fqn = identifier.get_schema_level_object_identifier(db, schema, ds_name)
225
+ query = f"CREATE DATASET{' IF NOT EXISTS' if exist_ok else ''} {ds_fqn}"
226
+ try:
227
+ session.sql(query).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
228
+ return Dataset(session, db, schema, ds_name)
229
+ except snowpark_exceptions.SnowparkClientException as e:
230
+ # Snowpark wraps the Python Connector error code in the head of the error message.
231
+ if e.message.startswith(dataset_errors.ERRNO_OBJECT_ALREADY_EXISTS):
232
+ raise snowml_exceptions.SnowflakeMLException(
233
+ error_code=error_codes.OBJECT_ALREADY_EXISTS,
234
+ original_exception=dataset_errors.DatasetExistError(
235
+ dataset_error_messages.DATASET_ALREADY_EXISTS.format(name)
236
+ ),
237
+ ) from e
238
+ else:
239
+ raise
240
+
241
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
242
+ def list_versions(self, detailed: bool = False) -> Union[List[str], List[snowpark.Row]]:
243
+ """Return list of versions"""
244
+ versions = self._list_versions()
245
+ versions.sort(key=lambda r: r[_DATASET_VERSION_NAME_COL])
246
+ if not detailed:
247
+ return [r[_DATASET_VERSION_NAME_COL] for r in versions]
248
+ return versions
249
+
250
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
251
+ def select_version(self, version: str) -> "Dataset":
252
+ """Return a new Dataset instance with the specified version selected.
253
+
254
+ Args:
255
+ version: Dataset version name.
256
+
257
+ Returns:
258
+ Dataset object.
259
+ """
260
+ self._validate_version_exists(version)
261
+ return Dataset(self._session, self._db, self._schema, self._name, version)
262
+
263
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
264
+ def create_version(
265
+ self,
266
+ version: str,
267
+ input_dataframe: snowpark.DataFrame,
268
+ shuffle: bool = False,
269
+ exclude_cols: Optional[List[str]] = None,
270
+ label_cols: Optional[List[str]] = None,
271
+ properties: Optional[dataset_metadata.DatasetPropertiesType] = None,
272
+ partition_by: Optional[str] = None,
273
+ comment: Optional[str] = None,
274
+ ) -> "Dataset":
275
+ """Create a new version of the current Dataset.
276
+
277
+ The result Dataset object captures the query result deterministically as stage files.
278
+
279
+ Args:
280
+ version: Dataset version name. Data contents are materialized to the Dataset entity.
281
+ input_dataframe: A Snowpark DataFrame which yields the Dataset contents.
282
+ shuffle: A boolean indicating whether the data should be shuffled globally. Defaults to False.
283
+ exclude_cols: Name of column(s) in dataset to be excluded during training/testing (e.g. timestamp).
284
+ label_cols: Name of column(s) in dataset that contains labels.
285
+ properties: Custom metadata properties, saved under `DatasetMetadata.properties`
286
+ partition_by: Optional partitioning scheme within the new Dataset version.
287
+ comment: A descriptive comment about this dataset.
288
+
289
+ Returns:
290
+ A Dataset object with the newly created version selected.
291
+
292
+ Raises:
293
+ SnowflakeMLException: The Dataset no longer exists.
294
+ SnowflakeMLException: The specified Dataset version already exists.
295
+ snowpark_exceptions.SnowparkClientException: An error occurred during Dataset creation.
296
+
297
+ Note: During the generation of stage files, data casting will occur. The casting rules are as follows::
298
+ - Data casting:
299
+ - DecimalType(NUMBER):
300
+ - If its scale is zero, cast to BIGINT
301
+ - If its scale is non-zero, cast to FLOAT
302
+ - DoubleType(DOUBLE): Cast to FLOAT.
303
+ - ByteType(TINYINT): Cast to SMALLINT.
304
+ - ShortType(SMALLINT):Cast to SMALLINT.
305
+ - IntegerType(INT): Cast to INT.
306
+ - LongType(BIGINT): Cast to BIGINT.
307
+ - No action:
308
+ - FloatType(FLOAT): No action.
309
+ - StringType(String): No action.
310
+ - BinaryType(BINARY): No action.
311
+ - BooleanType(BOOLEAN): No action.
312
+ - Not supported:
313
+ - ArrayType(ARRAY): Not supported. A warning will be logged.
314
+ - MapType(OBJECT): Not supported. A warning will be logged.
315
+ - TimestampType(TIMESTAMP): Not supported. A warning will be logged.
316
+ - TimeType(TIME): Not supported. A warning will be logged.
317
+ - DateType(DATE): Not supported. A warning will be logged.
318
+ - VariantType(VARIANT): Not supported. A warning will be logged.
319
+ """
320
+ casted_df = snowpark_dataframe_utils.cast_snowpark_dataframe(input_dataframe)
321
+
322
+ if shuffle:
323
+ casted_df = casted_df.order_by(functions.random())
324
+
325
+ source_query = json.dumps(input_dataframe.queries)
326
+ if len(source_query) > _METADATA_MAX_QUERY_LENGTH:
327
+ warnings.warn(
328
+ "Source query exceeded max query length, dropping from metadata (limit=%d, actual=%d)"
329
+ % (_METADATA_MAX_QUERY_LENGTH, len(source_query)),
330
+ stacklevel=2,
123
331
  )
332
+ source_query = "<query too long>"
124
333
 
125
- state_dict = {
126
- "df_query": _wrap_embedded_str(self.df.queries["queries"][0]),
127
- "generation_timestamp": self.generation_timestamp,
128
- "owner": self.owner,
129
- "materialized_table": _wrap_embedded_str(_get_val_or_null(self.materialized_table)),
130
- "snapshot_table": _wrap_embedded_str(_get_val_or_null(self.snapshot_table)),
131
- "timestamp_col": _wrap_embedded_str(_get_val_or_null(self.timestamp_col)),
132
- "label_cols": _get_val_or_null(self.label_cols),
133
- "feature_store_metadata": _wrap_embedded_str(self.feature_store_metadata.to_json())
134
- if self.feature_store_metadata is not None
135
- else "null",
136
- "schema_version": self.schema_version,
137
- "desc": self.desc,
138
- }
139
- return json.dumps(state_dict)
140
-
141
- @classmethod
142
- def from_json(cls, json_str: str, session: Session) -> "Dataset":
143
- json_dict = json.loads(json_str, strict=False)
144
- json_dict["df"] = session.sql(json_dict.pop("df_query"))
145
-
146
- fs_meta_json = json_dict["feature_store_metadata"]
147
- json_dict["feature_store_metadata"] = (
148
- FeatureStoreMetadata.from_json(fs_meta_json) if fs_meta_json != "null" else None
334
+ metadata = dataset_metadata.DatasetMetadata(
335
+ source_query=source_query,
336
+ owner=self._session.sql("SELECT CURRENT_USER()").collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)[0][
337
+ "CURRENT_USER()"
338
+ ],
339
+ exclude_cols=exclude_cols,
340
+ label_cols=label_cols,
341
+ properties=properties,
149
342
  )
150
343
 
151
- schema_version = json_dict.pop("schema_version")
152
- owner = json_dict.pop("owner")
344
+ post_actions = casted_df._plan.post_actions
345
+ try:
346
+ # Execute all but the last query, final query gets passed to ALTER DATASET ADD VERSION
347
+ query = casted_df._plan.queries[-1].sql.strip()
348
+ if len(casted_df._plan.queries) > 1:
349
+ casted_df._plan.queries = casted_df._plan.queries[:-1]
350
+ casted_df._plan.post_actions = []
351
+ casted_df.collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
352
+ sql_command = "ALTER DATASET {} ADD VERSION '{}' FROM ({})".format(
353
+ self.fully_qualified_name,
354
+ version,
355
+ query,
356
+ )
357
+ if partition_by:
358
+ sql_command += f" PARTITION BY {partition_by}"
359
+ if comment:
360
+ sql_command += f" COMMENT={formatting.format_value_for_select(comment)}"
361
+ sql_command += f" METADATA=$${metadata.to_json()}$$"
362
+ self._session.sql(sql_command).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
363
+
364
+ return Dataset(self._session, self._db, self._schema, self._name, version)
153
365
 
154
- result = cls(session, **json_dict)
155
- result.schema_version = schema_version
156
- result.owner = owner
366
+ except snowpark_exceptions.SnowparkClientException as e:
367
+ if e.message.startswith(dataset_errors.ERRNO_DATASET_NOT_EXIST):
368
+ raise snowml_exceptions.SnowflakeMLException(
369
+ error_code=error_codes.NOT_FOUND,
370
+ original_exception=dataset_errors.DatasetNotExistError(
371
+ dataset_error_messages.DATASET_NOT_EXIST.format(self.fully_qualified_name)
372
+ ),
373
+ ) from e
374
+ elif (
375
+ e.message.startswith(dataset_errors.ERRNO_DATASET_VERSION_ALREADY_EXISTS)
376
+ or e.message.startswith(dataset_errors.ERRNO_VERSION_ALREADY_EXISTS)
377
+ or e.message.startswith(dataset_errors.ERRNO_FILES_ALREADY_EXISTING)
378
+ ):
379
+ raise snowml_exceptions.SnowflakeMLException(
380
+ error_code=error_codes.OBJECT_ALREADY_EXISTS,
381
+ original_exception=dataset_errors.DatasetExistError(
382
+ dataset_error_messages.DATASET_VERSION_ALREADY_EXISTS.format(self.fully_qualified_name, version)
383
+ ),
384
+ ) from e
385
+ else:
386
+ raise
387
+ finally:
388
+ for action in post_actions:
389
+ self._session.sql(action.sql.strip()).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
157
390
 
158
- return result
391
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
392
def delete_version(self, version_name: str) -> None:
    """Drop a single version from this Dataset.

    Args:
        version_name: Name of the version to remove from the Dataset.

    Raises:
        SnowflakeMLException: The version could not be dropped. The underlying Snowpark client
            error is wrapped in a DatasetCannotDeleteError.
    """
    drop_statement = f"ALTER DATASET {self.fully_qualified_name} DROP VERSION '{version_name}'"
    try:
        self._session.sql(drop_statement).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
    except snowpark_exceptions.SnowparkClientException as err:
        wrapped = dataset_errors.DatasetCannotDeleteError(str(err))
        raise snowml_exceptions.SnowflakeMLException(
            error_code=error_codes.SNOWML_DELETE_FAILED,
            original_exception=wrapped,
        ) from err
412
+
413
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
414
def delete(self) -> None:
    """Drop this Dataset together with every version it contains."""
    # TODO: Check and warn if any versions exist
    drop_statement = f"DROP DATASET {self.fully_qualified_name}"
    self._session.sql(drop_statement).collect(statement_params=_TELEMETRY_STATEMENT_PARAMS)
420
+
421
def _list_versions(self, pattern: Optional[str] = None) -> List[snowpark.Row]:
    """Return the rows of ``SHOW VERSIONS`` for this Dataset.

    Args:
        pattern: Optional ``LIKE`` pattern used to filter the listed versions. When empty or
            None, no ``LIKE`` clause is added.

    Returns:
        The validated result rows; an empty result is accepted.

    Raises:
        SnowflakeMLException: The query failed with the "object does not exist" error code,
            reported as a missing Dataset.
    """
    try:
        like_clause = ""
        if pattern:
            like_clause = f" LIKE '{pattern}'"
        validator = query_result_checker.SqlResultValidator(
            self._session,
            f"SHOW VERSIONS{like_clause} IN DATASET {self.fully_qualified_name}",
            statement_params=_TELEMETRY_STATEMENT_PARAMS,
        )
        return validator.has_column(_DATASET_VERSION_NAME_COL, allow_empty=True).validate()
    except snowpark_exceptions.SnowparkClientException as err:
        # Snowpark wraps the Python Connector error code in the head of the error message.
        if not err.message.startswith(dataset_errors.ERRNO_OBJECT_NOT_EXIST):
            raise
        missing = dataset_errors.DatasetNotExistError(
            dataset_error_messages.DATASET_NOT_EXIST.format(self.fully_qualified_name)
        )
        raise snowml_exceptions.SnowflakeMLException(
            error_code=error_codes.NOT_FOUND,
            original_exception=missing,
        ) from err
445
+
446
def _validate_version_exists(self, version: str) -> None:
    """Check that ``version`` exists in this Dataset.

    Args:
        version: Version name to look for.

    Raises:
        SnowflakeMLException: No listed version has exactly this name (wraps DatasetNotExistError).
    """
    candidates = self._list_versions(version)
    # Case sensitive match: the listing only narrows candidates, exact equality decides.
    if any(row[_DATASET_VERSION_NAME_COL] == version for row in candidates):
        return
    not_found = dataset_errors.DatasetNotExistError(
        dataset_error_messages.DATASET_VERSION_NOT_EXIST.format(self.fully_qualified_name, version)
    )
    raise snowml_exceptions.SnowflakeMLException(
        error_code=error_codes.NOT_FOUND,
        original_exception=not_found,
    )
457
+
458
+
459
+ # Utility methods
460
+
461
+
462
def _get_schema_level_identifier(session: snowpark.Session, dataset_name: str) -> Tuple[str, str, str]:
    """Resolve a dataset name into a validated schema-level location identifier.

    Database and schema parts missing from ``dataset_name`` are taken from the session's current
    database and schema.

    Args:
        session: Snowpark session used to look up the current database and schema.
        dataset_name: Dataset name, optionally qualified with database and schema.

    Returns:
        A ``(database, schema, object_name)`` tuple of strings.

    Raises:
        ValueError: ``dataset_name`` has unexpected trailing content, or the database or schema
            can be resolved neither from the name nor from the session.
    """
    db, schema, object_name, others = identifier.parse_schema_level_object_identifier(dataset_name)
    if others:
        raise ValueError(f"Invalid identifier: unexpected '{others}'")
    db = db or session.get_current_database()
    schema = schema or session.get_current_schema()
    # The session may have no current database/schema; without this check str(None) would
    # silently produce the literal name "None".
    if not db or not schema:
        raise ValueError(
            f"Unable to resolve database and schema for '{dataset_name}': use a fully qualified name "
            "or set the current database and schema on the session"
        )
    return str(db), str(schema), str(object_name)
470
+
471
+
472
def _validate_dataset_exists(session: snowpark.Session, db: str, schema: str, dataset_name: str) -> None:
    """Raise if ``SHOW DATASETS`` finds no dataset named ``dataset_name`` in ``db.schema``.

    Args:
        session: Snowpark session used to run the lookup.
        db: Database to search in.
        schema: Schema to search in.
        dataset_name: Dataset identifier; it is resolved and, when double-quoted, unescaped
            before being used in the query.

    Raises:
        SnowflakeMLException: The lookup returned no rows (wraps DatasetNotExistError).
    """
    # FIXME: Once we switch version to SQL Identifiers we can just use version check with version=''
    dataset_name = identifier.resolve_identifier(dataset_name)
    is_quoted = dataset_name.startswith('"') and dataset_name.endswith('"')
    if is_quoted:
        dataset_name = identifier.get_unescaped_names(dataset_name)
    # Case sensitive match
    # NOTE(review): presumably STARTS WITH supplies the case-sensitive comparison on top of LIKE - confirm.
    query = f"show datasets like '{dataset_name}' in schema {db}.{schema} starts with '{dataset_name}'"
    if session.sql(query).count() != 0:
        return
    not_found = dataset_errors.DatasetNotExistError(dataset_error_messages.DATASET_NOT_EXIST.format(dataset_name))
    raise snowml_exceptions.SnowflakeMLException(
        error_code=error_codes.NOT_FOUND,
        original_exception=not_found,
    )
@@ -0,0 +1,53 @@
1
+ from typing import Any
2
+
3
+ from snowflake import snowpark
4
+ from snowflake.ml._internal import telemetry
5
+ from snowflake.ml.dataset import dataset
6
+
7
# Project name passed to the API usage telemetry decorator on this module's functions.
_PROJECT = "Dataset"
8
+
9
+
10
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
11
def create_from_dataframe(
    session: snowpark.Session,
    name: str,
    version: str,
    input_dataframe: snowpark.DataFrame,
    **version_kwargs: Any,
) -> dataset.Dataset:
    """
    Create a new Dataset version from a DataFrame and return the Dataset
    with that version selected.

    The Dataset is obtained via ``Dataset.create(..., exist_ok=True)`` before the
    version is added.

    Args:
        session: The Snowpark Session instance to use.
        name: The dataset name
        version: The dataset version name
        input_dataframe: DataFrame containing data to be saved to the created Dataset.
        version_kwargs: Keyword arguments passed to dataset version creation.
            See `Dataset.create_version()` documentation for supported arguments.

    Returns:
        A Dataset object.
    """
    parent = dataset.Dataset.create(session, name, exist_ok=True)
    parent.create_version(version, input_dataframe=input_dataframe, **version_kwargs)
    # select_version returns a new copy
    return parent.select_version(version)
37
+
38
+
39
+ @telemetry.send_api_usage_telemetry(project=_PROJECT)
40
def load_dataset(session: snowpark.Session, name: str, version: str) -> dataset.Dataset:
    """
    Load an existing Dataset and select one of its versions.

    Args:
        session: The Snowpark Session instance to use.
        name: The dataset name.
        version: The dataset version name.

    Returns:
        A Dataset object with the requested version selected.
    """
    loaded = dataset.Dataset.load(session, name)
    return loaded.select_version(version)