snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +29 -7
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/uri.py +7 -2
  5. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  6. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  7. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  8. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  9. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  10. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  11. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  12. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  13. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  14. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  15. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  16. snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
  17. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
  18. snowflake/ml/model/_deployer.py +14 -27
  19. snowflake/ml/model/_env.py +4 -4
  20. snowflake/ml/model/_handlers/custom.py +14 -2
  21. snowflake/ml/model/_handlers/pytorch.py +186 -0
  22. snowflake/ml/model/_handlers/sklearn.py +14 -9
  23. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  24. snowflake/ml/model/_handlers/torchscript.py +180 -0
  25. snowflake/ml/model/_handlers/xgboost.py +19 -9
  26. snowflake/ml/model/_model.py +3 -2
  27. snowflake/ml/model/_model_meta.py +12 -7
  28. snowflake/ml/model/model_signature.py +446 -66
  29. snowflake/ml/model/type_hints.py +23 -4
  30. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
  31. snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
  32. snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
  33. snowflake/ml/modeling/cluster/birch.py +51 -26
  34. snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
  35. snowflake/ml/modeling/cluster/dbscan.py +51 -26
  36. snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
  37. snowflake/ml/modeling/cluster/k_means.py +51 -26
  38. snowflake/ml/modeling/cluster/mean_shift.py +51 -26
  39. snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
  40. snowflake/ml/modeling/cluster/optics.py +51 -26
  41. snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
  42. snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
  43. snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
  44. snowflake/ml/modeling/compose/column_transformer.py +51 -26
  45. snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
  46. snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
  47. snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
  48. snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
  49. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
  50. snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
  51. snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
  52. snowflake/ml/modeling/covariance/oas.py +51 -26
  53. snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
  54. snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
  55. snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
  56. snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
  57. snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
  58. snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
  59. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
  60. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
  61. snowflake/ml/modeling/decomposition/pca.py +51 -26
  62. snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
  63. snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
  64. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
  65. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
  66. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
  67. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
  68. snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
  69. snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
  70. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
  71. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
  72. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
  73. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
  74. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
  75. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
  76. snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
  77. snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
  78. snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
  79. snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
  80. snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
  81. snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
  82. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
  83. snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
  84. snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
  85. snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
  86. snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
  87. snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
  88. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
  89. snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
  90. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
  91. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
  92. snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
  93. snowflake/ml/modeling/impute/knn_imputer.py +51 -26
  94. snowflake/ml/modeling/impute/missing_indicator.py +51 -26
  95. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
  96. snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
  97. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
  98. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
  99. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
  100. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
  101. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
  102. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
  103. snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
  104. snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
  105. snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
  106. snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
  107. snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
  108. snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
  109. snowflake/ml/modeling/linear_model/lars.py +51 -26
  110. snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
  111. snowflake/ml/modeling/linear_model/lasso.py +51 -26
  112. snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
  113. snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
  114. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
  115. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
  116. snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
  117. snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
  118. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
  119. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
  120. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
  121. snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
  122. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
  123. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
  124. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
  125. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
  126. snowflake/ml/modeling/linear_model/perceptron.py +51 -26
  127. snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
  128. snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
  129. snowflake/ml/modeling/linear_model/ridge.py +51 -26
  130. snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
  131. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
  132. snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
  133. snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
  134. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
  135. snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
  136. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
  137. snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
  138. snowflake/ml/modeling/manifold/isomap.py +51 -26
  139. snowflake/ml/modeling/manifold/mds.py +51 -26
  140. snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
  141. snowflake/ml/modeling/manifold/tsne.py +51 -26
  142. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
  143. snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
  144. snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
  145. snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
  146. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
  147. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
  148. snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
  149. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
  150. snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
  151. snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
  152. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
  153. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
  154. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
  155. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
  156. snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
  157. snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
  158. snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
  159. snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
  160. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
  161. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
  162. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
  163. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
  164. snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
  165. snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
  166. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  167. snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
  168. snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
  169. snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
  170. snowflake/ml/modeling/svm/linear_svc.py +51 -26
  171. snowflake/ml/modeling/svm/linear_svr.py +51 -26
  172. snowflake/ml/modeling/svm/nu_svc.py +51 -26
  173. snowflake/ml/modeling/svm/nu_svr.py +51 -26
  174. snowflake/ml/modeling/svm/svc.py +51 -26
  175. snowflake/ml/modeling/svm/svr.py +51 -26
  176. snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
  177. snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
  178. snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
  179. snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
  180. snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
  181. snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
  182. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
  183. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
  184. snowflake/ml/registry/model_registry.py +74 -56
  185. snowflake/ml/version.py +1 -1
  186. {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
  187. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  188. snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
  189. {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -262,7 +264,6 @@ class FeatureAgglomeration(BaseTransformer):
262
264
  sample_weight_col: Optional[str] = None,
263
265
  ) -> None:
264
266
  super().__init__()
265
- self.id = str(uuid4()).replace("-", "_").upper()
266
267
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
267
268
 
268
269
  self._deps = list(deps)
@@ -291,6 +292,15 @@ class FeatureAgglomeration(BaseTransformer):
291
292
  self.set_drop_input_cols(drop_input_cols)
292
293
  self.set_sample_weight_col(sample_weight_col)
293
294
 
295
+ def _get_rand_id(self) -> str:
296
+ """
297
+ Generate random id to be used in sproc and stage names.
298
+
299
+ Returns:
300
+ Random id string usable in sproc, table, and stage names.
301
+ """
302
+ return str(uuid4()).replace("-", "_").upper()
303
+
294
304
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
295
305
  """
296
306
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -369,7 +379,7 @@ class FeatureAgglomeration(BaseTransformer):
369
379
  cp.dump(self._sklearn_object, local_transform_file)
370
380
 
371
381
  # Create temp stage to run fit.
372
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
382
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
373
383
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
374
384
  SqlResultValidator(
375
385
  session=session,
@@ -382,11 +392,12 @@ class FeatureAgglomeration(BaseTransformer):
382
392
  expected_value=f"Stage area {transform_stage_name} successfully created."
383
393
  ).validate()
384
394
 
385
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
395
+ # Use posixpath to construct stage paths
396
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
397
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
386
398
  local_result_file_name = get_temp_file_path()
387
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
388
399
 
389
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
400
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
390
401
  statement_params = telemetry.get_function_usage_statement_params(
391
402
  project=_PROJECT,
392
403
  subproject=_SUBPROJECT,
@@ -412,6 +423,7 @@ class FeatureAgglomeration(BaseTransformer):
412
423
  replace=True,
413
424
  session=session,
414
425
  statement_params=statement_params,
426
+ anonymous=True
415
427
  )
416
428
  def fit_wrapper_sproc(
417
429
  session: Session,
@@ -420,7 +432,8 @@ class FeatureAgglomeration(BaseTransformer):
420
432
  stage_result_file_name: str,
421
433
  input_cols: List[str],
422
434
  label_cols: List[str],
423
- sample_weight_col: Optional[str]
435
+ sample_weight_col: Optional[str],
436
+ statement_params: Dict[str, str]
424
437
  ) -> str:
425
438
  import cloudpickle as cp
426
439
  import numpy as np
@@ -487,15 +500,15 @@ class FeatureAgglomeration(BaseTransformer):
487
500
  api_calls=[Session.call],
488
501
  custom_tags=dict([("autogen", True)]),
489
502
  )
490
- sproc_export_file_name = session.call(
491
- fit_sproc_name,
503
+ sproc_export_file_name = fit_wrapper_sproc(
504
+ session,
492
505
  query,
493
506
  stage_transform_file_name,
494
507
  stage_result_file_name,
495
508
  identifier.get_unescaped_names(self.input_cols),
496
509
  identifier.get_unescaped_names(self.label_cols),
497
510
  identifier.get_unescaped_names(self.sample_weight_col),
498
- statement_params=statement_params,
511
+ statement_params,
499
512
  )
500
513
 
501
514
  if "|" in sproc_export_file_name:
@@ -505,7 +518,7 @@ class FeatureAgglomeration(BaseTransformer):
505
518
  print("\n".join(fields[1:]))
506
519
 
507
520
  session.file.get(
508
- os.path.join(stage_result_file_name, sproc_export_file_name),
521
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
509
522
  local_result_file_name,
510
523
  statement_params=statement_params
511
524
  )
@@ -551,7 +564,7 @@ class FeatureAgglomeration(BaseTransformer):
551
564
 
552
565
  # Register vectorized UDF for batch inference
553
566
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
554
- safe_id=self.id, method=inference_method)
567
+ safe_id=self._get_rand_id(), method=inference_method)
555
568
 
556
569
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
557
570
  # will try to pickle all of self which fails.
@@ -643,7 +656,7 @@ class FeatureAgglomeration(BaseTransformer):
643
656
  return transformed_pandas_df.to_dict("records")
644
657
 
645
658
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
646
- safe_id=self.id
659
+ safe_id=self._get_rand_id()
647
660
  )
648
661
 
649
662
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -808,11 +821,18 @@ class FeatureAgglomeration(BaseTransformer):
808
821
  Transformed dataset.
809
822
  """
810
823
  if isinstance(dataset, DataFrame):
824
+ expected_type_inferred = ""
825
+ # when it is classifier, infer the datatype from label columns
826
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
827
+ expected_type_inferred = convert_sp_to_sf_type(
828
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
829
+ )
830
+
811
831
  output_df = self._batch_inference(
812
832
  dataset=dataset,
813
833
  inference_method="predict",
814
834
  expected_output_cols_list=self.output_cols,
815
- expected_output_cols_type="",
835
+ expected_output_cols_type=expected_type_inferred,
816
836
  )
817
837
  elif isinstance(dataset, pd.DataFrame):
818
838
  output_df = self._sklearn_inference(
@@ -885,10 +905,10 @@ class FeatureAgglomeration(BaseTransformer):
885
905
 
886
906
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
887
907
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
888
- Returns an empty list if current object is not a classifier or not yet fitted.
908
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
889
909
  """
890
910
  if getattr(self._sklearn_object, "classes_", None) is None:
891
- return []
911
+ return [output_cols_prefix]
892
912
 
893
913
  classes = self._sklearn_object.classes_
894
914
  if isinstance(classes, numpy.ndarray):
@@ -1113,7 +1133,7 @@ class FeatureAgglomeration(BaseTransformer):
1113
1133
  cp.dump(self._sklearn_object, local_score_file)
1114
1134
 
1115
1135
  # Create temp stage to run score.
1116
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1136
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1117
1137
  session = dataset._session
1118
1138
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1119
1139
  SqlResultValidator(
@@ -1127,8 +1147,9 @@ class FeatureAgglomeration(BaseTransformer):
1127
1147
  expected_value=f"Stage area {score_stage_name} successfully created."
1128
1148
  ).validate()
1129
1149
 
1130
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1131
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1150
+ # Use posixpath to construct stage paths
1151
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1152
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1132
1153
  statement_params = telemetry.get_function_usage_statement_params(
1133
1154
  project=_PROJECT,
1134
1155
  subproject=_SUBPROJECT,
@@ -1154,6 +1175,7 @@ class FeatureAgglomeration(BaseTransformer):
1154
1175
  replace=True,
1155
1176
  session=session,
1156
1177
  statement_params=statement_params,
1178
+ anonymous=True
1157
1179
  )
1158
1180
  def score_wrapper_sproc(
1159
1181
  session: Session,
@@ -1161,7 +1183,8 @@ class FeatureAgglomeration(BaseTransformer):
1161
1183
  stage_score_file_name: str,
1162
1184
  input_cols: List[str],
1163
1185
  label_cols: List[str],
1164
- sample_weight_col: Optional[str]
1186
+ sample_weight_col: Optional[str],
1187
+ statement_params: Dict[str, str]
1165
1188
  ) -> float:
1166
1189
  import cloudpickle as cp
1167
1190
  import numpy as np
@@ -1211,14 +1234,14 @@ class FeatureAgglomeration(BaseTransformer):
1211
1234
  api_calls=[Session.call],
1212
1235
  custom_tags=dict([("autogen", True)]),
1213
1236
  )
1214
- score = session.call(
1215
- score_sproc_name,
1237
+ score = score_wrapper_sproc(
1238
+ session,
1216
1239
  query,
1217
1240
  stage_score_file_name,
1218
1241
  identifier.get_unescaped_names(self.input_cols),
1219
1242
  identifier.get_unescaped_names(self.label_cols),
1220
1243
  identifier.get_unescaped_names(self.sample_weight_col),
1221
- statement_params=statement_params,
1244
+ statement_params,
1222
1245
  )
1223
1246
 
1224
1247
  cleanup_temp_files([local_score_file_name])
@@ -1236,18 +1259,20 @@ class FeatureAgglomeration(BaseTransformer):
1236
1259
  if self._sklearn_object._estimator_type == 'classifier':
1237
1260
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1238
1261
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1239
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1262
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1263
+ ([] if self._drop_input_cols else inputs) + outputs)
1240
1264
  # For regressor, the type of predict is float64
1241
1265
  elif self._sklearn_object._estimator_type == 'regressor':
1242
1266
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1243
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1244
-
1267
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1268
+ ([] if self._drop_input_cols else inputs) + outputs)
1245
1269
  for prob_func in PROB_FUNCTIONS:
1246
1270
  if hasattr(self, prob_func):
1247
1271
  output_cols_prefix: str = f"{prob_func}_"
1248
1272
  output_column_names = self._get_output_column_names(output_cols_prefix)
1249
1273
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1250
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1274
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1275
+ ([] if self._drop_input_cols else inputs) + outputs)
1251
1276
 
1252
1277
  @property
1253
1278
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -258,7 +260,6 @@ class KMeans(BaseTransformer):
258
260
  sample_weight_col: Optional[str] = None,
259
261
  ) -> None:
260
262
  super().__init__()
261
- self.id = str(uuid4()).replace("-", "_").upper()
262
263
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
263
264
 
264
265
  self._deps = list(deps)
@@ -286,6 +287,15 @@ class KMeans(BaseTransformer):
286
287
  self.set_drop_input_cols(drop_input_cols)
287
288
  self.set_sample_weight_col(sample_weight_col)
288
289
 
290
+ def _get_rand_id(self) -> str:
291
+ """
292
+ Generate random id to be used in sproc and stage names.
293
+
294
+ Returns:
295
+ Random id string usable in sproc, table, and stage names.
296
+ """
297
+ return str(uuid4()).replace("-", "_").upper()
298
+
289
299
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
290
300
  """
291
301
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -364,7 +374,7 @@ class KMeans(BaseTransformer):
364
374
  cp.dump(self._sklearn_object, local_transform_file)
365
375
 
366
376
  # Create temp stage to run fit.
367
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
377
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
368
378
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
369
379
  SqlResultValidator(
370
380
  session=session,
@@ -377,11 +387,12 @@ class KMeans(BaseTransformer):
377
387
  expected_value=f"Stage area {transform_stage_name} successfully created."
378
388
  ).validate()
379
389
 
380
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
390
+ # Use posixpath to construct stage paths
391
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
392
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
381
393
  local_result_file_name = get_temp_file_path()
382
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
383
394
 
384
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
395
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
385
396
  statement_params = telemetry.get_function_usage_statement_params(
386
397
  project=_PROJECT,
387
398
  subproject=_SUBPROJECT,
@@ -407,6 +418,7 @@ class KMeans(BaseTransformer):
407
418
  replace=True,
408
419
  session=session,
409
420
  statement_params=statement_params,
421
+ anonymous=True
410
422
  )
411
423
  def fit_wrapper_sproc(
412
424
  session: Session,
@@ -415,7 +427,8 @@ class KMeans(BaseTransformer):
415
427
  stage_result_file_name: str,
416
428
  input_cols: List[str],
417
429
  label_cols: List[str],
418
- sample_weight_col: Optional[str]
430
+ sample_weight_col: Optional[str],
431
+ statement_params: Dict[str, str]
419
432
  ) -> str:
420
433
  import cloudpickle as cp
421
434
  import numpy as np
@@ -482,15 +495,15 @@ class KMeans(BaseTransformer):
482
495
  api_calls=[Session.call],
483
496
  custom_tags=dict([("autogen", True)]),
484
497
  )
485
- sproc_export_file_name = session.call(
486
- fit_sproc_name,
498
+ sproc_export_file_name = fit_wrapper_sproc(
499
+ session,
487
500
  query,
488
501
  stage_transform_file_name,
489
502
  stage_result_file_name,
490
503
  identifier.get_unescaped_names(self.input_cols),
491
504
  identifier.get_unescaped_names(self.label_cols),
492
505
  identifier.get_unescaped_names(self.sample_weight_col),
493
- statement_params=statement_params,
506
+ statement_params,
494
507
  )
495
508
 
496
509
  if "|" in sproc_export_file_name:
@@ -500,7 +513,7 @@ class KMeans(BaseTransformer):
500
513
  print("\n".join(fields[1:]))
501
514
 
502
515
  session.file.get(
503
- os.path.join(stage_result_file_name, sproc_export_file_name),
516
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
504
517
  local_result_file_name,
505
518
  statement_params=statement_params
506
519
  )
@@ -546,7 +559,7 @@ class KMeans(BaseTransformer):
546
559
 
547
560
  # Register vectorized UDF for batch inference
548
561
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
549
- safe_id=self.id, method=inference_method)
562
+ safe_id=self._get_rand_id(), method=inference_method)
550
563
 
551
564
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
552
565
  # will try to pickle all of self which fails.
@@ -638,7 +651,7 @@ class KMeans(BaseTransformer):
638
651
  return transformed_pandas_df.to_dict("records")
639
652
 
640
653
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
641
- safe_id=self.id
654
+ safe_id=self._get_rand_id()
642
655
  )
643
656
 
644
657
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -805,11 +818,18 @@ class KMeans(BaseTransformer):
805
818
  Transformed dataset.
806
819
  """
807
820
  if isinstance(dataset, DataFrame):
821
+ expected_type_inferred = ""
822
+ # when it is classifier, infer the datatype from label columns
823
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
824
+ expected_type_inferred = convert_sp_to_sf_type(
825
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
826
+ )
827
+
808
828
  output_df = self._batch_inference(
809
829
  dataset=dataset,
810
830
  inference_method="predict",
811
831
  expected_output_cols_list=self.output_cols,
812
- expected_output_cols_type="",
832
+ expected_output_cols_type=expected_type_inferred,
813
833
  )
814
834
  elif isinstance(dataset, pd.DataFrame):
815
835
  output_df = self._sklearn_inference(
@@ -882,10 +902,10 @@ class KMeans(BaseTransformer):
882
902
 
883
903
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
884
904
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
885
- Returns an empty list if current object is not a classifier or not yet fitted.
905
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
886
906
  """
887
907
  if getattr(self._sklearn_object, "classes_", None) is None:
888
- return []
908
+ return [output_cols_prefix]
889
909
 
890
910
  classes = self._sklearn_object.classes_
891
911
  if isinstance(classes, numpy.ndarray):
@@ -1110,7 +1130,7 @@ class KMeans(BaseTransformer):
1110
1130
  cp.dump(self._sklearn_object, local_score_file)
1111
1131
 
1112
1132
  # Create temp stage to run score.
1113
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1133
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1114
1134
  session = dataset._session
1115
1135
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1116
1136
  SqlResultValidator(
@@ -1124,8 +1144,9 @@ class KMeans(BaseTransformer):
1124
1144
  expected_value=f"Stage area {score_stage_name} successfully created."
1125
1145
  ).validate()
1126
1146
 
1127
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1128
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1147
+ # Use posixpath to construct stage paths
1148
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1149
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1129
1150
  statement_params = telemetry.get_function_usage_statement_params(
1130
1151
  project=_PROJECT,
1131
1152
  subproject=_SUBPROJECT,
@@ -1151,6 +1172,7 @@ class KMeans(BaseTransformer):
1151
1172
  replace=True,
1152
1173
  session=session,
1153
1174
  statement_params=statement_params,
1175
+ anonymous=True
1154
1176
  )
1155
1177
  def score_wrapper_sproc(
1156
1178
  session: Session,
@@ -1158,7 +1180,8 @@ class KMeans(BaseTransformer):
1158
1180
  stage_score_file_name: str,
1159
1181
  input_cols: List[str],
1160
1182
  label_cols: List[str],
1161
- sample_weight_col: Optional[str]
1183
+ sample_weight_col: Optional[str],
1184
+ statement_params: Dict[str, str]
1162
1185
  ) -> float:
1163
1186
  import cloudpickle as cp
1164
1187
  import numpy as np
@@ -1208,14 +1231,14 @@ class KMeans(BaseTransformer):
1208
1231
  api_calls=[Session.call],
1209
1232
  custom_tags=dict([("autogen", True)]),
1210
1233
  )
1211
- score = session.call(
1212
- score_sproc_name,
1234
+ score = score_wrapper_sproc(
1235
+ session,
1213
1236
  query,
1214
1237
  stage_score_file_name,
1215
1238
  identifier.get_unescaped_names(self.input_cols),
1216
1239
  identifier.get_unescaped_names(self.label_cols),
1217
1240
  identifier.get_unescaped_names(self.sample_weight_col),
1218
- statement_params=statement_params,
1241
+ statement_params,
1219
1242
  )
1220
1243
 
1221
1244
  cleanup_temp_files([local_score_file_name])
@@ -1233,18 +1256,20 @@ class KMeans(BaseTransformer):
1233
1256
  if self._sklearn_object._estimator_type == 'classifier':
1234
1257
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1235
1258
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1236
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1259
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1260
+ ([] if self._drop_input_cols else inputs) + outputs)
1237
1261
  # For regressor, the type of predict is float64
1238
1262
  elif self._sklearn_object._estimator_type == 'regressor':
1239
1263
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1240
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1241
-
1264
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1265
+ ([] if self._drop_input_cols else inputs) + outputs)
1242
1266
  for prob_func in PROB_FUNCTIONS:
1243
1267
  if hasattr(self, prob_func):
1244
1268
  output_cols_prefix: str = f"{prob_func}_"
1245
1269
  output_column_names = self._get_output_column_names(output_cols_prefix)
1246
1270
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1247
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1271
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1272
+ ([] if self._drop_input_cols else inputs) + outputs)
1248
1273
 
1249
1274
  @property
1250
1275
  def model_signatures(self) -> Dict[str, ModelSignature]: