snowflake-ml-python 1.5.3__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. snowflake/cortex/__init__.py +4 -1
  2. snowflake/cortex/_classify_text.py +36 -0
  3. snowflake/cortex/_complete.py +281 -21
  4. snowflake/cortex/_extract_answer.py +0 -1
  5. snowflake/cortex/_sentiment.py +0 -1
  6. snowflake/cortex/_summarize.py +0 -1
  7. snowflake/cortex/_translate.py +0 -1
  8. snowflake/cortex/_util.py +12 -85
  9. snowflake/ml/_internal/container_services/image_registry/http_client.py +10 -3
  10. snowflake/ml/_internal/container_services/image_registry/imagelib.py +23 -10
  11. snowflake/ml/_internal/container_services/image_registry/registry_client.py +7 -1
  12. snowflake/ml/_internal/exceptions/dataset_errors.py +7 -7
  13. snowflake/ml/_internal/exceptions/fileset_errors.py +3 -3
  14. snowflake/ml/_internal/exceptions/sql_error_codes.py +6 -0
  15. snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
  16. snowflake/ml/_internal/telemetry.py +38 -2
  17. snowflake/ml/_internal/utils/identifier.py +14 -0
  18. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +15 -4
  19. snowflake/ml/data/_internal/arrow_ingestor.py +228 -0
  20. snowflake/ml/data/_internal/ingestor_utils.py +58 -0
  21. snowflake/ml/data/data_connector.py +133 -0
  22. snowflake/ml/data/data_ingestor.py +28 -0
  23. snowflake/ml/data/data_source.py +23 -0
  24. snowflake/ml/dataset/dataset.py +39 -32
  25. snowflake/ml/dataset/dataset_reader.py +18 -118
  26. snowflake/ml/feature_store/access_manager.py +7 -1
  27. snowflake/ml/feature_store/entity.py +19 -2
  28. snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
  29. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +31 -0
  30. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +24 -0
  31. snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +4 -0
  32. snowflake/ml/feature_store/examples/example_helper.py +240 -0
  33. snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
  34. snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py +39 -0
  35. snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +58 -0
  36. snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -0
  37. snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
  38. snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
  39. snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
  40. snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
  41. snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
  42. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +29 -0
  43. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +21 -0
  44. snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +5 -0
  45. snowflake/ml/feature_store/feature_store.py +987 -264
  46. snowflake/ml/feature_store/feature_view.py +228 -13
  47. snowflake/ml/fileset/embedded_stage_fs.py +25 -21
  48. snowflake/ml/fileset/fileset.py +2 -2
  49. snowflake/ml/fileset/snowfs.py +4 -15
  50. snowflake/ml/fileset/stage_fs.py +24 -18
  51. snowflake/ml/lineage/__init__.py +3 -0
  52. snowflake/ml/lineage/lineage_node.py +139 -0
  53. snowflake/ml/model/_client/model/model_impl.py +47 -14
  54. snowflake/ml/model/_client/model/model_version_impl.py +82 -2
  55. snowflake/ml/model/_client/ops/model_ops.py +77 -5
  56. snowflake/ml/model/_client/sql/model.py +1 -0
  57. snowflake/ml/model/_client/sql/model_version.py +45 -2
  58. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +4 -6
  59. snowflake/ml/model/_model_composer/model_composer.py +15 -17
  60. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -17
  61. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
  62. snowflake/ml/model/_model_composer/model_method/function_generator.py +20 -4
  63. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
  64. snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +55 -0
  65. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +5 -34
  66. snowflake/ml/model/_model_composer/model_method/model_method.py +10 -7
  67. snowflake/ml/model/_packager/model_handlers/_base.py +13 -3
  68. snowflake/ml/model/_packager/model_handlers/_utils.py +59 -1
  69. snowflake/ml/model/_packager/model_handlers/catboost.py +44 -2
  70. snowflake/ml/model/_packager/model_handlers/custom.py +12 -4
  71. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +18 -15
  72. snowflake/ml/model/_packager/model_handlers/lightgbm.py +70 -2
  73. snowflake/ml/model/_packager/model_handlers/llm.py +2 -2
  74. snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -2
  75. snowflake/ml/model/_packager/model_handlers/pytorch.py +2 -2
  76. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +2 -2
  77. snowflake/ml/model/_packager/model_handlers/sklearn.py +2 -2
  78. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +2 -2
  79. snowflake/ml/model/_packager/model_handlers/tensorflow.py +2 -2
  80. snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
  81. snowflake/ml/model/_packager/model_handlers/xgboost.py +61 -2
  82. snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
  83. snowflake/ml/model/_packager/model_meta/model_blob_meta.py +2 -0
  84. snowflake/ml/model/_packager/model_meta/model_meta.py +21 -1
  85. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
  86. snowflake/ml/model/_packager/model_packager.py +9 -4
  87. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  88. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -5
  89. snowflake/ml/model/custom_model.py +22 -2
  90. snowflake/ml/model/model_signature.py +4 -4
  91. snowflake/ml/model/type_hints.py +77 -4
  92. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +3 -1
  93. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
  94. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +1 -0
  95. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +6 -0
  96. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +1 -0
  97. snowflake/ml/modeling/cluster/affinity_propagation.py +4 -2
  98. snowflake/ml/modeling/cluster/agglomerative_clustering.py +4 -2
  99. snowflake/ml/modeling/cluster/birch.py +4 -2
  100. snowflake/ml/modeling/cluster/bisecting_k_means.py +4 -2
  101. snowflake/ml/modeling/cluster/dbscan.py +4 -2
  102. snowflake/ml/modeling/cluster/feature_agglomeration.py +4 -2
  103. snowflake/ml/modeling/cluster/k_means.py +4 -2
  104. snowflake/ml/modeling/cluster/mean_shift.py +4 -2
  105. snowflake/ml/modeling/cluster/mini_batch_k_means.py +4 -2
  106. snowflake/ml/modeling/cluster/optics.py +4 -2
  107. snowflake/ml/modeling/cluster/spectral_biclustering.py +4 -2
  108. snowflake/ml/modeling/cluster/spectral_clustering.py +4 -2
  109. snowflake/ml/modeling/cluster/spectral_coclustering.py +4 -2
  110. snowflake/ml/modeling/compose/column_transformer.py +4 -2
  111. snowflake/ml/modeling/covariance/elliptic_envelope.py +4 -2
  112. snowflake/ml/modeling/covariance/empirical_covariance.py +4 -2
  113. snowflake/ml/modeling/covariance/graphical_lasso.py +4 -2
  114. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +4 -2
  115. snowflake/ml/modeling/covariance/ledoit_wolf.py +4 -2
  116. snowflake/ml/modeling/covariance/min_cov_det.py +4 -2
  117. snowflake/ml/modeling/covariance/oas.py +4 -2
  118. snowflake/ml/modeling/covariance/shrunk_covariance.py +4 -2
  119. snowflake/ml/modeling/decomposition/dictionary_learning.py +4 -2
  120. snowflake/ml/modeling/decomposition/factor_analysis.py +4 -2
  121. snowflake/ml/modeling/decomposition/fast_ica.py +4 -2
  122. snowflake/ml/modeling/decomposition/incremental_pca.py +4 -2
  123. snowflake/ml/modeling/decomposition/kernel_pca.py +4 -2
  124. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +4 -2
  125. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +4 -2
  126. snowflake/ml/modeling/decomposition/pca.py +4 -2
  127. snowflake/ml/modeling/decomposition/sparse_pca.py +4 -2
  128. snowflake/ml/modeling/decomposition/truncated_svd.py +4 -2
  129. snowflake/ml/modeling/ensemble/isolation_forest.py +4 -2
  130. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +4 -2
  131. snowflake/ml/modeling/feature_selection/variance_threshold.py +4 -2
  132. snowflake/ml/modeling/impute/iterative_imputer.py +4 -2
  133. snowflake/ml/modeling/impute/knn_imputer.py +4 -2
  134. snowflake/ml/modeling/impute/missing_indicator.py +4 -2
  135. snowflake/ml/modeling/impute/simple_imputer.py +26 -0
  136. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +4 -2
  137. snowflake/ml/modeling/kernel_approximation/nystroem.py +4 -2
  138. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +4 -2
  139. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +4 -2
  140. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +4 -2
  141. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +4 -2
  142. snowflake/ml/modeling/manifold/isomap.py +4 -2
  143. snowflake/ml/modeling/manifold/mds.py +4 -2
  144. snowflake/ml/modeling/manifold/spectral_embedding.py +4 -2
  145. snowflake/ml/modeling/manifold/tsne.py +4 -2
  146. snowflake/ml/modeling/metrics/ranking.py +3 -0
  147. snowflake/ml/modeling/metrics/regression.py +3 -0
  148. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +4 -2
  149. snowflake/ml/modeling/mixture/gaussian_mixture.py +4 -2
  150. snowflake/ml/modeling/neighbors/kernel_density.py +4 -2
  151. snowflake/ml/modeling/neighbors/local_outlier_factor.py +4 -2
  152. snowflake/ml/modeling/neighbors/nearest_neighbors.py +4 -2
  153. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +4 -2
  154. snowflake/ml/modeling/pipeline/pipeline.py +5 -4
  155. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +43 -9
  156. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +36 -8
  157. snowflake/ml/modeling/preprocessing/polynomial_features.py +4 -2
  158. snowflake/ml/registry/_manager/model_manager.py +16 -3
  159. snowflake/ml/registry/registry.py +100 -13
  160. snowflake/ml/version.py +1 -1
  161. {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA +81 -7
  162. {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/RECORD +165 -139
  163. {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/WHEEL +1 -1
  164. snowflake/ml/_internal/lineage/data_source.py +0 -10
  165. {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/LICENSE.txt +0 -0
  166. {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/preprocessing/one_hot_encoder.py CHANGED
@@ -101,16 +101,20 @@ class OneHotEncoder(base.BaseTransformer):
     (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).
 
     Args:
-        categories: 'auto' or dict {column_name: np.ndarray([category])}, default='auto'
+        categories: 'auto', list of array-like, or dict {column_name: np.ndarray([category])}, default='auto'
            Categories (unique values) per feature:
            - 'auto': Determine categories automatically from the training data.
+           - list: ``categories[i]`` holds the categories expected in the ith
+             column. The passed categories should not mix strings and numeric
+             values within a single feature, and should be sorted in case of
+             numeric values.
            - dict: ``categories[column_name]`` holds the categories expected in
              the column provided. The passed categories should not mix strings
              and numeric values within a single feature, and should be sorted in
              case of numeric values.
            The used categories can be found in the ``categories_`` attribute.
 
-        drop: {first’, if_binary} or an array-like of shape (n_features,), default=None
+        drop: {'first', 'if_binary'} or an array-like of shape (n_features,), default=None
            Specifies a methodology to use to drop one of the categories per
            feature. This is useful in situations where perfectly collinear
            features cause problems, such as when feeding the resulting data
@@ -206,7 +210,7 @@ class OneHotEncoder(base.BaseTransformer):
     def __init__(
         self,
         *,
-        categories: Union[str, Dict[str, type_utils.LiteralNDArrayType]] = "auto",
+        categories: Union[str, List[type_utils.LiteralNDArrayType], Dict[str, type_utils.LiteralNDArrayType]] = "auto",
         drop: Optional[Union[str, npt.ArrayLike]] = None,
         sparse: bool = False,
         handle_unknown: str = "error",
@@ -440,8 +444,19 @@ class OneHotEncoder(base.BaseTransformer):
         assert found_state_df is not None
         if self.categories != "auto":
             state_data = []
-            assert isinstance(self.categories, dict)
-            for input_col, cats in self.categories.items():
+            if isinstance(self.categories, list):
+                categories_map = {col_name: cats for col_name, cats in zip(self.input_cols, self.categories)}
+            elif isinstance(self.categories, dict):
+                categories_map = self.categories
+            else:
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_ARGUMENT,
+                    original_exception=ValueError(
+                        f"Invalid type {type(self.categories)} provided for argument `categories`"
+                    ),
+                )
+
+            for input_col, cats in categories_map.items():
                 for cat in cats.tolist():
                     state_data.append([input_col, cat])
             # states of given categories
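The list branch above normalizes positional categories into the same per-column map the dict branch already provides. A standalone illustration of that pairing, with hypothetical column names and values:

```python
import numpy as np

# zip pairs list entries with input_cols positionally, so the list must be ordered
# like input_cols; a length mismatch is caught by the validation shown further below.
input_cols = ["COLOR", "SIZE"]
categories = [np.array(["red", "green", "blue"]), np.array(["S", "M", "L"])]
categories_map = {col: cats for col, cats in zip(input_cols, categories)}
# {'COLOR': array(['red', 'green', 'blue'], dtype='<U5'), 'SIZE': array(['S', 'M', 'L'], dtype='<U1')}
```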
@@ -565,6 +580,8 @@ class OneHotEncoder(base.BaseTransformer):
             else:
                 categories[k] = vectorized_func(v)
             self.categories_ = categories
+        elif isinstance(self.categories, list):
+            self.categories_ = {col_name: cats for col_name, cats in zip(self.input_cols, self.categories)}
         else:
             self.categories_ = self.categories
 
@@ -850,8 +867,15 @@ class OneHotEncoder(base.BaseTransformer):
         # In case of fitting with pandas dataframe and transforming with snowpark dataframe
         # state_pandas cannot recognize the datatype of _CATEGORY and _FITTED_CATEGORY column
         # Therefore, apply the convert_to_string_excluding_nan function to _CATEGORY and _FITTED_CATEGORY
-        state_pandas[[_CATEGORY]] = state_pandas[[_CATEGORY]].applymap(convert_to_string_excluding_nan)
-        state_pandas[[_FITTED_CATEGORY]] = state_pandas[[_FITTED_CATEGORY]].applymap(convert_to_string_excluding_nan)
+        # applymap is depreciated since pandas 2.1.0, replaced by map
+        if pd.__version__ < "2.1.0":
+            state_pandas[[_CATEGORY]] = state_pandas[[_CATEGORY]].applymap(convert_to_string_excluding_nan)
+            state_pandas[[_FITTED_CATEGORY]] = state_pandas[[_FITTED_CATEGORY]].applymap(
+                convert_to_string_excluding_nan
+            )
+        else:
+            state_pandas[[_CATEGORY]] = state_pandas[[_CATEGORY]].map(convert_to_string_excluding_nan)
+            state_pandas[[_FITTED_CATEGORY]] = state_pandas[[_FITTED_CATEGORY]].map(convert_to_string_excluding_nan)
         state_df = dataset._session.create_dataframe(state_pandas)
 
         transformed_dataset = dataset
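The guard above compares `pd.__version__` lexicographically, which happens to work for current pandas releases but mis-orders strings such as "2.9" vs "2.10". A minimal sketch, not the library's code, of the same dispatch using `packaging.version` for a robust comparison:

```python
from typing import Any, Callable

import pandas as pd
from packaging import version


def map_elementwise(df: pd.DataFrame, func: Callable[[Any], Any]) -> pd.DataFrame:
    """Element-wise apply that picks the non-deprecated API for the installed pandas."""
    if version.parse(pd.__version__) >= version.parse("2.1.0"):
        return df.map(func)  # DataFrame.map superseded applymap in pandas 2.1.0
    return df.applymap(func)  # older pandas only provides applymap
```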
@@ -1009,7 +1033,7 @@ class OneHotEncoder(base.BaseTransformer):
                 error_code=error_codes.INVALID_ATTRIBUTE,
                 original_exception=ValueError(f"Unsupported `categories` value: {self.categories}."),
             )
-        elif isinstance(self.categories, dict):
+        elif isinstance(self.categories, (dict, list)):
             if len(self.categories) != len(self.input_cols):
                 raise exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_ATTRIBUTE,
@@ -1018,7 +1042,7 @@ class OneHotEncoder(base.BaseTransformer):
                         f"({len(self.input_cols)})."
                     ),
                 )
-            elif set(self.categories.keys()) != set(self.input_cols):
+            elif isinstance(self.categories, dict) and set(self.categories.keys()) != set(self.input_cols):
                 raise exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_ATTRIBUTE,
                     original_exception=ValueError(
@@ -1537,6 +1561,16 @@ class OneHotEncoder(base.BaseTransformer):
         default_sklearn_args = _utils.get_default_args(default_sklearn_obj.__class__.__init__)
         given_args = self.get_params()
 
+        if "categories" in given_args and isinstance(given_args["categories"], dict):
+            # sklearn requires a list of array-like to satisfy the `categories` arg
+            try:
+                given_args["categories"] = [given_args["categories"][input_col] for input_col in self.input_cols]
+            except KeyError as e:
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_ARGUMENT,
+                    original_exception=e,
+                )
+
         # replace 'sparse' with 'sparse_output' when scikit-learn>=1.2
         sklearn_version = sklearn.__version__
         if version.parse(sklearn_version) >= version.parse(_SKLEARN_DEPRECATED_KEYWORD_TO_VERSION_DICT["sparse"]):
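Taken together, the OneHotEncoder hunks above enable sklearn-style positional categories. A hedged usage sketch, assuming a Snowpark `session` and a hypothetical PRODUCTS table with COLOR and SIZE columns:

```python
import numpy as np

from snowflake.ml.modeling.preprocessing import OneHotEncoder

df = session.table("PRODUCTS")  # hypothetical table

# categories[i] pairs with input_cols[i]; before 1.6.0 only 'auto' or a dict
# keyed by column name was accepted.
encoder = OneHotEncoder(
    categories=[np.array(["red", "green", "blue"]), np.array(["S", "M", "L"])],
    input_cols=["COLOR", "SIZE"],
    output_cols=["COLOR_OHE", "SIZE_OHE"],
)
transformed = encoder.fit(df).transform(df)
```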
snowflake/ml/modeling/preprocessing/ordinal_encoder.py CHANGED
@@ -45,9 +45,11 @@ class OrdinalEncoder(base.BaseTransformer):
     (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html).
 
     Args:
-        categories: Union[str, Dict[str, type_utils.LiteralNDArrayType]], default="auto"
+        categories: Union[str, List[type_utils.LiteralNDArrayType], Dict[str, type_utils.LiteralNDArrayType]],
+            default="auto"
            The string 'auto' (the default) causes the categories to be extracted from the input columns.
-           To specify the categories yourself, pass a dictionary mapping the column name to an ndarray containing the
+           To specify the categories yourself, pass either (1) a list of ndarrays containing the categories or
+           (2) a dictionary mapping the column name to an ndarray containing the
            categories.
 
        handle_unknown: str, default="error"
@@ -96,7 +98,7 @@ class OrdinalEncoder(base.BaseTransformer):
     def __init__(
         self,
         *,
-        categories: Union[str, Dict[str, type_utils.LiteralNDArrayType]] = "auto",
+        categories: Union[str, List[type_utils.LiteralNDArrayType], Dict[str, type_utils.LiteralNDArrayType]] = "auto",
         handle_unknown: str = "error",
         unknown_value: Optional[Union[int, float]] = None,
         encoded_missing_value: Union[int, float] = np.nan,
@@ -114,9 +116,13 @@ class OrdinalEncoder(base.BaseTransformer):
         a single column of integers (0 to n_categories - 1) per feature.
 
         Args:
-            categories: 'auto' or dict {column_name: ndarray([category])}, default='auto'
+            categories: 'auto', list of array-like, or dict {column_name: ndarray([category])}, default='auto'
                Categories (unique values) per feature:
                - 'auto': Determine categories automatically from the training data.
+               - list: ``categories[i]`` holds the categories expected in the ith
+                 column. The passed categories should not mix strings and numeric
+                 values within a single feature, and should be sorted in case of
+                 numeric values.
                - dict: ``categories[column_name]`` holds the categories expected in
                  the column provided. The passed categories should not mix strings
                  and numeric values within a single feature, and should be sorted in
@@ -317,8 +323,19 @@ class OrdinalEncoder(base.BaseTransformer):
         assert found_state_df is not None
         if self.categories != "auto":
             state_data = []
-            assert isinstance(self.categories, dict)
-            for input_col, cats in self.categories.items():
+            if isinstance(self.categories, list):
+                categories_map = {col_name: cats for col_name, cats in zip(self.input_cols, self.categories)}
+            elif isinstance(self.categories, dict):
+                categories_map = self.categories
+            else:
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_ARGUMENT,
+                    original_exception=ValueError(
+                        f"Invalid type {type(self.categories)} provided for argument `categories`"
+                    ),
+                )
+
+            for input_col, cats in categories_map.items():
                 for idx, cat in enumerate(cats.tolist()):
                     state_data.append([input_col, cat, idx])
             # states of given categories
@@ -368,6 +385,8 @@ class OrdinalEncoder(base.BaseTransformer):
                 for col_name, cats in grouped_categories.items()
             }
             self.categories_ = categories
+        elif isinstance(self.categories, list):
+            self.categories_ = {col_name: cats for col_name, cats in zip(self.input_cols, self.categories)}
         else:
             self.categories_ = self.categories
 
@@ -548,6 +567,15 @@ class OrdinalEncoder(base.BaseTransformer):
             snowml_only_keywords=_SNOWML_ONLY_KEYWORDS,
             sklearn_added_keyword_to_version_dict=_SKLEARN_ADDED_KEYWORD_TO_VERSION_DICT,
         )
+        if "categories" in sklearn_args and isinstance(sklearn_args["categories"], dict):
+            # sklearn requires a list of array-like to satisfy the `categories` arg
+            try:
+                sklearn_args["categories"] = [sklearn_args["categories"][input_col] for input_col in self.input_cols]
+            except KeyError as e:
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.INVALID_ARGUMENT,
+                    original_exception=e,
+                )
         return preprocessing.OrdinalEncoder(**sklearn_args)
 
     def _create_sklearn_object(self) -> preprocessing.OrdinalEncoder:
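The conversion above exists because sklearn's encoders accept `categories` only as a list ordered like the columns, never as a dict. A hedged illustration of the re-ordering with hypothetical values:

```python
import numpy as np
from sklearn import preprocessing

input_cols = ["COLOR", "SIZE"]
categories_by_col = {"SIZE": np.array(["S", "M", "L"]), "COLOR": np.array(["red", "green"])}

# Re-order the per-column dict into input_cols order; a missing key here is what
# surfaces as the KeyError wrapped into SnowflakeMLException above.
sklearn_categories = [categories_by_col[col] for col in input_cols]
sk_encoder = preprocessing.OrdinalEncoder(categories=sklearn_categories)
```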
@@ -570,7 +598,7 @@ class OrdinalEncoder(base.BaseTransformer):
                 error_code=error_codes.INVALID_ATTRIBUTE,
                 original_exception=ValueError(f"Unsupported `categories` value: {self.categories}."),
             )
-        elif isinstance(self.categories, dict):
+        elif isinstance(self.categories, (dict, list)):
             if len(self.categories) != len(self.input_cols):
                 raise exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_ATTRIBUTE,
@@ -579,7 +607,7 @@ class OrdinalEncoder(base.BaseTransformer):
                         f"({len(self.input_cols)})."
                     ),
                 )
-            elif set(self.categories.keys()) != set(self.input_cols):
+            elif isinstance(self.categories, dict) and set(self.categories.keys()) != set(self.input_cols):
                 raise exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_ATTRIBUTE,
                     original_exception=ValueError(
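A matching hedged sketch for OrdinalEncoder, under the same hypothetical session and table as the OneHotEncoder example above; with the list form, array order defines the integer codes (S -> 0, M -> 1, L -> 2):

```python
import numpy as np

from snowflake.ml.modeling.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(
    categories=[np.array(["S", "M", "L"])],
    input_cols=["SIZE"],
    output_cols=["SIZE_ORD"],
)
encoded = encoder.fit(df).transform(df)
```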
snowflake/ml/modeling/preprocessing/polynomial_features.py CHANGED
@@ -76,8 +76,10 @@ class PolynomialFeatures(BaseTransformer):
         initialization with the `set_input_cols` method.
 
     label_cols: Optional[Union[str, List[str]]]
-        This parameter is optional and will be ignored during fit. It is present here for API consistency by convention.
-
+        A string or list of strings representing column names that contain labels.
+        Label columns must be specified with this parameter during initialization
+        or with the `set_label_cols` method before fitting.
+
     output_cols: Optional[Union[str, List[str]]]
         A string or list of strings representing column names that will store the
         output of predict and transform operations. The length of output_cols must
snowflake/ml/registry/_manager/model_manager.py CHANGED
@@ -4,12 +4,14 @@ from typing import Any, Dict, List, Optional, Union
 import pandas as pd
 from absl.logging import logging
 
+from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.human_readable_id import hrid_generator
 from snowflake.ml._internal.utils import sql_identifier
 from snowflake.ml.model import model_signature, type_hints as model_types
 from snowflake.ml.model._client.model import model_impl, model_version_impl
 from snowflake.ml.model._client.ops import metadata_ops, model_ops
 from snowflake.ml.model._model_composer import model_composer
+from snowflake.ml.model._packager.model_meta import model_meta
 from snowflake.snowpark import session
 
 logger = logging.getLogger(__name__)
@@ -124,7 +126,10 @@ class ModelManager:
             version_name=version_name_id,
             statement_params=statement_params,
         ):
-            raise ValueError(f"Model {model_name} version {version_name} already existed.")
+            raise ValueError(
+                f"Model {model_name} version {version_name} already existed. "
+                + "To auto-generate `version_name`, skip that argument."
+            )
 
         stage_path = self._model_ops.prepare_model_stage_path(
             database_name=database_name_id,
@@ -134,8 +139,10 @@ class ModelManager:
 
         logger.info("Start packaging and uploading your model. It might take some time based on the size of the model.")
 
-        mc = model_composer.ModelComposer(self._model_ops._session, stage_path=stage_path)
-        mc.save(
+        mc = model_composer.ModelComposer(
+            self._model_ops._session, stage_path=stage_path, statement_params=statement_params
+        )
+        model_metadata: model_meta.ModelMetadata = mc.save(
             name=model_name_id.resolved(),
             model=model,
             signatures=signatures,
@@ -147,6 +154,12 @@ class ModelManager:
             ext_modules=ext_modules,
             options=options,
         )
+        statement_params = telemetry.add_statement_params_custom_tags(
+            statement_params, model_metadata.telemetry_metadata()
+        )
+        statement_params = telemetry.add_statement_params_custom_tags(
+            statement_params, {"model_version_name": version_name_id}
+        )
 
         logger.info("Start creating MODEL object for you in the Snowflake.")
 
snowflake/ml/registry/registry.py CHANGED
@@ -1,5 +1,6 @@
+import warnings
 from types import ModuleType
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union, overload
 
 import pandas as pd
 
@@ -68,6 +69,90 @@ class Registry:
         """Get the location (database.schema) of the registry."""
         return ".".join([self._database_name.identifier(), self._schema_name.identifier()])
 
+    @overload
+    def log_model(
+        self,
+        model: model_types.SupportedModelType,
+        *,
+        model_name: str,
+        version_name: Optional[str] = None,
+        comment: Optional[str] = None,
+        metrics: Optional[Dict[str, Any]] = None,
+        conda_dependencies: Optional[List[str]] = None,
+        pip_requirements: Optional[List[str]] = None,
+        python_version: Optional[str] = None,
+        signatures: Optional[Dict[str, model_signature.ModelSignature]] = None,
+        sample_input_data: Optional[model_types.SupportedDataType] = None,
+        code_paths: Optional[List[str]] = None,
+        ext_modules: Optional[List[ModuleType]] = None,
+        options: Optional[model_types.ModelSaveOption] = None,
+    ) -> ModelVersion:
+        """
+        Log a model with various parameters and metadata.
+
+        Args:
+            model: Model object of supported types such as Scikit-learn, XGBoost, LightGBM, Snowpark ML,
+                PyTorch, TorchScript, Tensorflow, Tensorflow Keras, MLFlow, HuggingFace Pipeline,
+                Sentence Transformers, Peft-finetuned LLM, or Custom Model.
+            model_name: Name to identify the model.
+            version_name: Version identifier for the model. Combination of model_name and version_name must be unique.
+                If not specified, a random name will be generated.
+            comment: Comment associated with the model version. Defaults to None.
+            metrics: A JSON serializable dictionary containing metrics linked to the model version. Defaults to None.
+            signatures: Model data signatures for inputs and outputs for various target methods. If it is None,
+                sample_input_data would be used to infer the signatures for those models that cannot automatically
+                infer the signature. If not None, sample_input_data should not be specified. Defaults to None.
+            sample_input_data: Sample input data to infer model signatures from. Defaults to None.
+            conda_dependencies: List of Conda package specifications. Use "[channel::]package [operator version]" syntax
+                to specify a dependency. It is a recommended way to specify your dependencies using conda. When channel
+                is not specified, Snowflake Anaconda Channel will be used. Defaults to None.
+            pip_requirements: List of Pip package specifications. Defaults to None.
+                Currently it is not supported since Model can only executed in Snowflake Warehouse where all
+                dependencies are required to be retrieved from Snowflake Anaconda Channel.
+            python_version: Python version in which the model is run. Defaults to None.
+            code_paths: List of directories containing code to import. Defaults to None.
+            ext_modules: List of external modules to pickle with the model object.
+                Only supported when logging the following types of model:
+                Scikit-learn, Snowpark ML, PyTorch, TorchScript and Custom Model. Defaults to None.
+            options (Dict[str, Any], optional): Additional model saving options.
+                Model Saving Options include:
+                - embed_local_ml_library: Embed local Snowpark ML into the code directory or folder.
+                  Override to True if the local Snowpark ML version is not available in the Snowflake Anaconda
+                  Channel. Otherwise, defaults to False
+                - relax_version: Whether or not relax the version constraints of the dependencies.
+                  It detects any ==x.y.z in specifiers and replaced with >=x.y, <(x+1). Defaults to True.
+                - function_type: Set the method function type globally. To set method function types individually see
+                  function_type in model_options.
+                - method_options: Per-method saving options including:
+                  - case_sensitive: Indicates whether the method and its signature should be case sensitive.
+                    This means when you refer the method in the SQL, you need to double quote it.
+                    This will be helpful if you need case to tell apart your methods or features, or you have
+                    non-alphabetic characters in your method or feature name. Defaults to False.
+                  - max_batch_size: Maximum batch size that the method could accept in the Snowflake Warehouse.
+                    Defaults to None, determined automatically by Snowflake.
+                  - function_type: One of supported model method function types (FUNCTION or TABLE_FUNCTION).
+        """
+        ...
+
+    @overload
+    def log_model(
+        self,
+        model: ModelVersion,
+        *,
+        model_name: str,
+        version_name: Optional[str] = None,
+    ) -> ModelVersion:
+        """
+        Log a model with a ModelVersion object.
+
+        Args:
+            model: Source ModelVersion object used to create the new ModelVersion object.
+            model_name: Name to identify the model.
+            version_name: Version identifier for the model. Combination of model_name and version_name must be unique.
+                If not specified, a random name will be generated.
+        """
+        ...
+
     @telemetry.send_api_usage_telemetry(
         project=_TELEMETRY_PROJECT,
         subproject=_MODEL_TELEMETRY_SUBPROJECT,
@@ -84,7 +169,7 @@ class Registry:
     )
     def log_model(
         self,
-        model: model_types.SupportedModelType,
+        model: Union[model_types.SupportedModelType, ModelVersion],
         *,
         model_name: str,
         version_name: Optional[str] = None,
@@ -100,12 +185,14 @@ class Registry:
         options: Optional[model_types.ModelSaveOption] = None,
     ) -> ModelVersion:
         """
-        Log a model with various parameters and metadata.
+        Log a model with various parameters and metadata, or a ModelVersion object.
 
         Args:
-            model: Model object of supported types such as Scikit-learn, XGBoost, LightGBM, Snowpark ML,
-                PyTorch, TorchScript, Tensorflow, Tensorflow Keras, MLFlow, HuggingFace Pipeline,
-                Sentence Transformers, Peft-finetuned LLM, or Custom Model.
+            model: Supported model or ModelVersion object.
+                - Supported model: Model object of supported types such as Scikit-learn, XGBoost, LightGBM, Snowpark ML,
+                  PyTorch, TorchScript, Tensorflow, Tensorflow Keras, MLFlow, HuggingFace Pipeline, Sentence Transformers,
+                  Peft-finetuned LLM, or Custom Model.
+                - ModelVersion: Source ModelVersion object used to create the new ModelVersion object.
             model_name: Name to identify the model.
             version_name: Version identifier for the model. Combination of model_name and version_name must be unique.
                 If not specified, a random name will be generated.
@@ -146,9 +233,6 @@ class Registry:
                     Defaults to None, determined automatically by Snowflake.
                 - function_type: One of supported model method function types (FUNCTION or TABLE_FUNCTION).
 
-        Raises:
-            NotImplementedError: `pip_requirements` is not supported.
-
         Returns:
             ModelVersion: ModelVersion object corresponding to the model just logged.
         """
@@ -157,10 +241,13 @@ class Registry:
             subproject=_MODEL_TELEMETRY_SUBPROJECT,
         )
         if pip_requirements:
-            raise NotImplementedError(
-                "Currently `pip_requirements` is not supported since Model can only executed "
+            warnings.warn(
+                "Models logged specifying `pip_requirements` can not be executed "
                 "in Snowflake Warehouse where all dependencies are required to be retrieved "
-                "from Snowflake Anaconda Channel."
+                "from Snowflake Anaconda Channel. Specify model save option `include_pip_dependencies`"
+                "to log model with pip dependencies.",
+                category=UserWarning,
+                stacklevel=1,
             )
         return self._model_manager.log_model(
             model=model,
@@ -169,7 +256,7 @@ class Registry:
             comment=comment,
             metrics=metrics,
             conda_dependencies=conda_dependencies,
-            pip_requirements=None,
+            pip_requirements=pip_requirements,
             python_version=python_version,
             signatures=signatures,
             sample_input_data=sample_input_data,
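The overloads give `log_model` two call shapes. A hedged sketch, assuming a Snowpark `session` and a fitted scikit-learn estimator `clf` (names illustrative):

```python
from snowflake.ml.registry import Registry

reg = Registry(session=session)

# Shape 1: log a supported model object; version_name is auto-generated if omitted.
mv = reg.log_model(
    clf,
    model_name="MY_MODEL",
    version_name="V1",
    comment="baseline",
)

# Shape 2 (new in 1.6.0): create a new version from an existing ModelVersion.
mv_copy = reg.log_model(mv, model_name="MY_MODEL_COPY")
```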
snowflake/ml/version.py CHANGED
@@ -1 +1 @@
-VERSION="1.5.3"
+VERSION="1.6.0"
{snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: snowflake-ml-python
-Version: 1.5.3
+Version: 1.6.0
 Summary: The machine learning client library that is used for interacting with Snowflake to build machine learning solutions.
 Author-email: "Snowflake, Inc" <support@snowflake.com>
 License:
@@ -250,7 +250,7 @@ Requires-Dist: s3fs <2024,>=2022.11
 Requires-Dist: scikit-learn <1.4,>=1.2.1
 Requires-Dist: scipy <2,>=1.9
 Requires-Dist: snowflake-connector-python[pandas] <4,>=3.5.0
-Requires-Dist: snowflake-snowpark-python <2,>=1.15.0
+Requires-Dist: snowflake-snowpark-python <2,>=1.17.0
 Requires-Dist: sqlparse <1,>=0.4
 Requires-Dist: typing-extensions <5,>=4.1.0
 Requires-Dist: xgboost <2,>=1.7.3
@@ -264,7 +264,7 @@ Requires-Dist: sentencepiece <1,>=0.1.95 ; extra == 'all'
 Requires-Dist: shap ==0.42.1 ; extra == 'all'
 Requires-Dist: tensorflow <3,>=2.10 ; extra == 'all'
 Requires-Dist: tokenizers <1,>=0.10 ; extra == 'all'
-Requires-Dist: torch <3,>=2.0.1 ; extra == 'all'
+Requires-Dist: torch <2.3.0,>=2.0.1 ; extra == 'all'
 Requires-Dist: torchdata <1,>=0.4 ; extra == 'all'
 Requires-Dist: transformers <5,>=4.32.1 ; extra == 'all'
 Provides-Extra: catboost
@@ -280,7 +280,7 @@ Requires-Dist: shap ==0.42.1 ; extra == 'shap'
 Provides-Extra: tensorflow
 Requires-Dist: tensorflow <3,>=2.10 ; extra == 'tensorflow'
 Provides-Extra: torch
-Requires-Dist: torch <3,>=2.0.1 ; extra == 'torch'
+Requires-Dist: torch <2.3.0,>=2.0.1 ; extra == 'torch'
 Requires-Dist: torchdata <1,>=0.4 ; extra == 'torch'
 Provides-Extra: transformers
 Requires-Dist: sentence-transformers <3,>=2.2.2 ; extra == 'transformers'
@@ -373,7 +373,83 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
 
 # Release History
 
-## 1.5.3
+## 1.6.0
+
+### Bug Fixes
+
+- Modeling: `SimpleImputer` can impute integer columns with integer values.
+- Registry: Fix an issue when providing a pandas Dataframe whose index is not starting from 0 as the input to
+  the `ModelVersion.run`.
+
+### New Features
+
+- Feature Store: Add overloads to APIs accept both object and name/version. Impacted APIs include read_feature_view(),
+  refresh_feature_view(), get_refresh_history(), resume_feature_view(), suspend_feature_view(), delete_feature_view().
+- Feature Store: Add docstring inline examples for all public APIs.
+- Feature Store: Add new utility class `ExampleHelper` to help with load source data to simplify public notebooks.
+- Registry: Option to `enable_explainability` when registering XGBoost models as a pre-PuPr feature.
+- Feature Store: add new API `update_entity()`.
+- Registry: Option to `enable_explainability` when registering Catboost models as a pre-PuPr feature.
+- Feature Store: Add new argument warehouse to FeatureView constructor to overwrite the default warehouse. Also add
+  a new column 'warehouse' to the output of list_feature_views().
+- Registry: Add support for logging model from a model version.
+- Modeling: Distributed Hyperparameter Optimization now announce GA refresh version. The latest memory efficient version
+  will not have the 10GB training limitation for dataset any more. To turn off, please run
+  `
+  from snowflake.ml.modeling._internal.snowpark_implementations import (
+      distributed_hpo_trainer,
+  )
+  distributed_hpo_trainer.ENABLE_EFFICIENT_MEMORY_USAGE = False
+  `
+- Registry: Option to `enable_explainability` when registering LightGBM models as a pre-PuPr feature.
+
+### Behavior Changes
+
+- Feature Store: change some positional parameters to keyword arguments in following APIs:
+  - Entity(): desc.
+  - FeatureView(): timestamp_col, refresh_freq, desc.
+  - FeatureStore(): creation_mode.
+  - update_entity(): desc.
+  - register_feature_view(): block, overwrite.
+  - list_feature_views(): entity_name, feature_view_name.
+  - get_refresh_history(): verbose.
+  - retrieve_feature_values(): spine_timestamp_col, exclude_columns, include_feature_view_timestamp_col.
+  - generate_training_set(): save_as, spine_timestamp_col, spine_label_cols, exclude_columns,
+    include_feature_view_timestamp_col.
+  - generate_dataset(): version, spine_timestamp_col, spine_label_cols, exclude_columns,
+    include_feature_view_timestamp_col, desc, output_type.
+
+## 1.5.4 (2024-07-11)
+
+### Bug Fixes
+
+- Model Registry (PrPr): Fix 401 Unauthorized issue when deploying model to SPCS.
+- Feature Store: Downgrades exceptions to warnings for few property setters in feature view. Now you can set
+  desc, refresh_freq and warehouse for draft feature views.
+- Modeling: Fix an issue with calling `OrdinalEncoder` with `categories` as a dictionary and a pandas DataFrame
+- Modeling: Fix an issue with calling `OneHotEncoder` with `categories` as a dictionary and a pandas DataFrame
+
+### New Features
+
+- Registry: Allow overriding `device_map` and `device` when loading huggingface pipeline models.
+- Registry: Add `set_alias` method to `ModelVersion` instance to set an alias to model version.
+- Registry: Add `unset_alias` method to `ModelVersion` instance to unset an alias to model version.
+- Registry: Add `partitioned_inference_api` allowing users to create partitioned inference functions in registered
+  models. Enable model inference methods with table functions with vectorized process methods in registered models.
+- Feature Store: add 3 more columns: refresh_freq, refresh_mode and scheduling_state to the result of
+  `list_feature_views()`.
+- Feature Store: `update_feature_view()` supports updating description.
+- Feature Store: add new API `refresh_feature_view()`.
+- Feature Store: add new API `get_refresh_history()`.
+- Feature Store: Add `generate_training_set()` API for generating table-backed feature snapshots.
+- Feature Store: Add `DeprecationWarning` for `generate_dataset(..., output_type="table")`.
+- Model Development: OrdinalEncoder supports a list of array-likes for `categories` argument.
+- Model Development: OneHotEncoder supports a list of array-likes for `categories` argument.
+
+## 1.5.3 (06-17-2024)
@@ -382,8 +458,6 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
 - Registry: Fix an issue that leads to incorrect result when using pandas Dataframe with over 100, 000 rows as the input
   of `ModelVersion.run` method in Stored Procedure.
 
-### Behavior Changes
-
 ### New Features
 
 - Registry: Add support for TIMESTAMP_NTZ model signature data type, allowing timestamp input and output.