snowflake-ml-python 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +29 -7
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/uri.py +7 -2
  5. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  6. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  7. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  8. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  9. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  10. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  11. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  12. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  13. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  14. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  15. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  16. snowflake/ml/model/_deploy_client/warehouse/deploy.py +24 -6
  17. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +5 -2
  18. snowflake/ml/model/_deployer.py +14 -27
  19. snowflake/ml/model/_env.py +4 -4
  20. snowflake/ml/model/_handlers/custom.py +14 -2
  21. snowflake/ml/model/_handlers/pytorch.py +186 -0
  22. snowflake/ml/model/_handlers/sklearn.py +14 -9
  23. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  24. snowflake/ml/model/_handlers/torchscript.py +180 -0
  25. snowflake/ml/model/_handlers/xgboost.py +19 -9
  26. snowflake/ml/model/_model.py +3 -2
  27. snowflake/ml/model/_model_meta.py +12 -7
  28. snowflake/ml/model/model_signature.py +446 -66
  29. snowflake/ml/model/type_hints.py +23 -4
  30. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -26
  31. snowflake/ml/modeling/cluster/affinity_propagation.py +51 -26
  32. snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -26
  33. snowflake/ml/modeling/cluster/birch.py +51 -26
  34. snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -26
  35. snowflake/ml/modeling/cluster/dbscan.py +51 -26
  36. snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -26
  37. snowflake/ml/modeling/cluster/k_means.py +51 -26
  38. snowflake/ml/modeling/cluster/mean_shift.py +51 -26
  39. snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -26
  40. snowflake/ml/modeling/cluster/optics.py +51 -26
  41. snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -26
  42. snowflake/ml/modeling/cluster/spectral_clustering.py +51 -26
  43. snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -26
  44. snowflake/ml/modeling/compose/column_transformer.py +51 -26
  45. snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -26
  46. snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -26
  47. snowflake/ml/modeling/covariance/empirical_covariance.py +51 -26
  48. snowflake/ml/modeling/covariance/graphical_lasso.py +51 -26
  49. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -26
  50. snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -26
  51. snowflake/ml/modeling/covariance/min_cov_det.py +51 -26
  52. snowflake/ml/modeling/covariance/oas.py +51 -26
  53. snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -26
  54. snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -26
  55. snowflake/ml/modeling/decomposition/factor_analysis.py +51 -26
  56. snowflake/ml/modeling/decomposition/fast_ica.py +51 -26
  57. snowflake/ml/modeling/decomposition/incremental_pca.py +51 -26
  58. snowflake/ml/modeling/decomposition/kernel_pca.py +51 -26
  59. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -26
  60. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -26
  61. snowflake/ml/modeling/decomposition/pca.py +51 -26
  62. snowflake/ml/modeling/decomposition/sparse_pca.py +51 -26
  63. snowflake/ml/modeling/decomposition/truncated_svd.py +51 -26
  64. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -26
  65. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -26
  66. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -26
  67. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -26
  68. snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -26
  69. snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -26
  70. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -26
  71. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -26
  72. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -26
  73. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -26
  74. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -26
  75. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -26
  76. snowflake/ml/modeling/ensemble/isolation_forest.py +51 -26
  77. snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -26
  78. snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -26
  79. snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -26
  80. snowflake/ml/modeling/ensemble/voting_classifier.py +51 -26
  81. snowflake/ml/modeling/ensemble/voting_regressor.py +51 -26
  82. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -26
  83. snowflake/ml/modeling/feature_selection/select_fdr.py +51 -26
  84. snowflake/ml/modeling/feature_selection/select_fpr.py +51 -26
  85. snowflake/ml/modeling/feature_selection/select_fwe.py +51 -26
  86. snowflake/ml/modeling/feature_selection/select_k_best.py +51 -26
  87. snowflake/ml/modeling/feature_selection/select_percentile.py +51 -26
  88. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -26
  89. snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -26
  90. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -26
  91. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -26
  92. snowflake/ml/modeling/impute/iterative_imputer.py +51 -26
  93. snowflake/ml/modeling/impute/knn_imputer.py +51 -26
  94. snowflake/ml/modeling/impute/missing_indicator.py +51 -26
  95. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -26
  96. snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -26
  97. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -26
  98. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -26
  99. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -26
  100. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -26
  101. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -26
  102. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -26
  103. snowflake/ml/modeling/linear_model/ard_regression.py +51 -26
  104. snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -26
  105. snowflake/ml/modeling/linear_model/elastic_net.py +51 -26
  106. snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -26
  107. snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -26
  108. snowflake/ml/modeling/linear_model/huber_regressor.py +51 -26
  109. snowflake/ml/modeling/linear_model/lars.py +51 -26
  110. snowflake/ml/modeling/linear_model/lars_cv.py +51 -26
  111. snowflake/ml/modeling/linear_model/lasso.py +51 -26
  112. snowflake/ml/modeling/linear_model/lasso_cv.py +51 -26
  113. snowflake/ml/modeling/linear_model/lasso_lars.py +51 -26
  114. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -26
  115. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -26
  116. snowflake/ml/modeling/linear_model/linear_regression.py +51 -26
  117. snowflake/ml/modeling/linear_model/logistic_regression.py +51 -26
  118. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -26
  119. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -26
  120. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -26
  121. snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -26
  122. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -26
  123. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -26
  124. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -26
  125. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -26
  126. snowflake/ml/modeling/linear_model/perceptron.py +51 -26
  127. snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -26
  128. snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -26
  129. snowflake/ml/modeling/linear_model/ridge.py +51 -26
  130. snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -26
  131. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -26
  132. snowflake/ml/modeling/linear_model/ridge_cv.py +51 -26
  133. snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -26
  134. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -26
  135. snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -26
  136. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -26
  137. snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -26
  138. snowflake/ml/modeling/manifold/isomap.py +51 -26
  139. snowflake/ml/modeling/manifold/mds.py +51 -26
  140. snowflake/ml/modeling/manifold/spectral_embedding.py +51 -26
  141. snowflake/ml/modeling/manifold/tsne.py +51 -26
  142. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -26
  143. snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -26
  144. snowflake/ml/modeling/model_selection/grid_search_cv.py +51 -26
  145. snowflake/ml/modeling/model_selection/randomized_search_cv.py +51 -26
  146. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -26
  147. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -26
  148. snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -26
  149. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -26
  150. snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -26
  151. snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -26
  152. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -26
  153. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -26
  154. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -26
  155. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -26
  156. snowflake/ml/modeling/neighbors/kernel_density.py +51 -26
  157. snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -26
  158. snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -26
  159. snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -26
  160. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -26
  161. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -26
  162. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -26
  163. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -26
  164. snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -26
  165. snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -26
  166. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  167. snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -26
  168. snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -26
  169. snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -26
  170. snowflake/ml/modeling/svm/linear_svc.py +51 -26
  171. snowflake/ml/modeling/svm/linear_svr.py +51 -26
  172. snowflake/ml/modeling/svm/nu_svc.py +51 -26
  173. snowflake/ml/modeling/svm/nu_svr.py +51 -26
  174. snowflake/ml/modeling/svm/svc.py +51 -26
  175. snowflake/ml/modeling/svm/svr.py +51 -26
  176. snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -26
  177. snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -26
  178. snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -26
  179. snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -26
  180. snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -26
  181. snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -26
  182. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -26
  183. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -26
  184. snowflake/ml/registry/model_registry.py +74 -56
  185. snowflake/ml/version.py +1 -1
  186. {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +27 -8
  187. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  188. snowflake_ml_python-1.0.2.dist-info/RECORD +0 -246
  189. {snowflake_ml_python-1.0.2.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -246,7 +248,6 @@ class SequentialFeatureSelector(BaseTransformer):
246
248
  sample_weight_col: Optional[str] = None,
247
249
  ) -> None:
248
250
  super().__init__()
249
- self.id = str(uuid4()).replace("-", "_").upper()
250
251
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
251
252
  deps = deps | _gather_dependencies(estimator)
252
253
  self._deps = list(deps)
@@ -272,6 +273,15 @@ class SequentialFeatureSelector(BaseTransformer):
272
273
  self.set_drop_input_cols(drop_input_cols)
273
274
  self.set_sample_weight_col(sample_weight_col)
274
275
 
276
+ def _get_rand_id(self) -> str:
277
+ """
278
+ Generate random id to be used in sproc and stage names.
279
+
280
+ Returns:
281
+ Random id string usable in sproc, table, and stage names.
282
+ """
283
+ return str(uuid4()).replace("-", "_").upper()
284
+
275
285
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
276
286
  """
277
287
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -350,7 +360,7 @@ class SequentialFeatureSelector(BaseTransformer):
350
360
  cp.dump(self._sklearn_object, local_transform_file)
351
361
 
352
362
  # Create temp stage to run fit.
353
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
363
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
354
364
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
355
365
  SqlResultValidator(
356
366
  session=session,
@@ -363,11 +373,12 @@ class SequentialFeatureSelector(BaseTransformer):
363
373
  expected_value=f"Stage area {transform_stage_name} successfully created."
364
374
  ).validate()
365
375
 
366
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
376
+ # Use posixpath to construct stage paths
377
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
378
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
367
379
  local_result_file_name = get_temp_file_path()
368
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
369
380
 
370
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
381
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
371
382
  statement_params = telemetry.get_function_usage_statement_params(
372
383
  project=_PROJECT,
373
384
  subproject=_SUBPROJECT,
@@ -393,6 +404,7 @@ class SequentialFeatureSelector(BaseTransformer):
393
404
  replace=True,
394
405
  session=session,
395
406
  statement_params=statement_params,
407
+ anonymous=True
396
408
  )
397
409
  def fit_wrapper_sproc(
398
410
  session: Session,
@@ -401,7 +413,8 @@ class SequentialFeatureSelector(BaseTransformer):
401
413
  stage_result_file_name: str,
402
414
  input_cols: List[str],
403
415
  label_cols: List[str],
404
- sample_weight_col: Optional[str]
416
+ sample_weight_col: Optional[str],
417
+ statement_params: Dict[str, str]
405
418
  ) -> str:
406
419
  import cloudpickle as cp
407
420
  import numpy as np
@@ -468,15 +481,15 @@ class SequentialFeatureSelector(BaseTransformer):
468
481
  api_calls=[Session.call],
469
482
  custom_tags=dict([("autogen", True)]),
470
483
  )
471
- sproc_export_file_name = session.call(
472
- fit_sproc_name,
484
+ sproc_export_file_name = fit_wrapper_sproc(
485
+ session,
473
486
  query,
474
487
  stage_transform_file_name,
475
488
  stage_result_file_name,
476
489
  identifier.get_unescaped_names(self.input_cols),
477
490
  identifier.get_unescaped_names(self.label_cols),
478
491
  identifier.get_unescaped_names(self.sample_weight_col),
479
- statement_params=statement_params,
492
+ statement_params,
480
493
  )
481
494
 
482
495
  if "|" in sproc_export_file_name:
@@ -486,7 +499,7 @@ class SequentialFeatureSelector(BaseTransformer):
486
499
  print("\n".join(fields[1:]))
487
500
 
488
501
  session.file.get(
489
- os.path.join(stage_result_file_name, sproc_export_file_name),
502
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
490
503
  local_result_file_name,
491
504
  statement_params=statement_params
492
505
  )
@@ -532,7 +545,7 @@ class SequentialFeatureSelector(BaseTransformer):
532
545
 
533
546
  # Register vectorized UDF for batch inference
534
547
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
535
- safe_id=self.id, method=inference_method)
548
+ safe_id=self._get_rand_id(), method=inference_method)
536
549
 
537
550
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
538
551
  # will try to pickle all of self which fails.
@@ -624,7 +637,7 @@ class SequentialFeatureSelector(BaseTransformer):
624
637
  return transformed_pandas_df.to_dict("records")
625
638
 
626
639
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
627
- safe_id=self.id
640
+ safe_id=self._get_rand_id()
628
641
  )
629
642
 
630
643
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -789,11 +802,18 @@ class SequentialFeatureSelector(BaseTransformer):
789
802
  Transformed dataset.
790
803
  """
791
804
  if isinstance(dataset, DataFrame):
805
+ expected_type_inferred = ""
806
+ # when it is classifier, infer the datatype from label columns
807
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
808
+ expected_type_inferred = convert_sp_to_sf_type(
809
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
810
+ )
811
+
792
812
  output_df = self._batch_inference(
793
813
  dataset=dataset,
794
814
  inference_method="predict",
795
815
  expected_output_cols_list=self.output_cols,
796
- expected_output_cols_type="",
816
+ expected_output_cols_type=expected_type_inferred,
797
817
  )
798
818
  elif isinstance(dataset, pd.DataFrame):
799
819
  output_df = self._sklearn_inference(
@@ -866,10 +886,10 @@ class SequentialFeatureSelector(BaseTransformer):
866
886
 
867
887
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
868
888
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
869
- Returns an empty list if current object is not a classifier or not yet fitted.
889
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
870
890
  """
871
891
  if getattr(self._sklearn_object, "classes_", None) is None:
872
- return []
892
+ return [output_cols_prefix]
873
893
 
874
894
  classes = self._sklearn_object.classes_
875
895
  if isinstance(classes, numpy.ndarray):
@@ -1094,7 +1114,7 @@ class SequentialFeatureSelector(BaseTransformer):
1094
1114
  cp.dump(self._sklearn_object, local_score_file)
1095
1115
 
1096
1116
  # Create temp stage to run score.
1097
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1117
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1098
1118
  session = dataset._session
1099
1119
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1100
1120
  SqlResultValidator(
@@ -1108,8 +1128,9 @@ class SequentialFeatureSelector(BaseTransformer):
1108
1128
  expected_value=f"Stage area {score_stage_name} successfully created."
1109
1129
  ).validate()
1110
1130
 
1111
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1112
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1131
+ # Use posixpath to construct stage paths
1132
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1133
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1113
1134
  statement_params = telemetry.get_function_usage_statement_params(
1114
1135
  project=_PROJECT,
1115
1136
  subproject=_SUBPROJECT,
@@ -1135,6 +1156,7 @@ class SequentialFeatureSelector(BaseTransformer):
1135
1156
  replace=True,
1136
1157
  session=session,
1137
1158
  statement_params=statement_params,
1159
+ anonymous=True
1138
1160
  )
1139
1161
  def score_wrapper_sproc(
1140
1162
  session: Session,
@@ -1142,7 +1164,8 @@ class SequentialFeatureSelector(BaseTransformer):
1142
1164
  stage_score_file_name: str,
1143
1165
  input_cols: List[str],
1144
1166
  label_cols: List[str],
1145
- sample_weight_col: Optional[str]
1167
+ sample_weight_col: Optional[str],
1168
+ statement_params: Dict[str, str]
1146
1169
  ) -> float:
1147
1170
  import cloudpickle as cp
1148
1171
  import numpy as np
@@ -1192,14 +1215,14 @@ class SequentialFeatureSelector(BaseTransformer):
1192
1215
  api_calls=[Session.call],
1193
1216
  custom_tags=dict([("autogen", True)]),
1194
1217
  )
1195
- score = session.call(
1196
- score_sproc_name,
1218
+ score = score_wrapper_sproc(
1219
+ session,
1197
1220
  query,
1198
1221
  stage_score_file_name,
1199
1222
  identifier.get_unescaped_names(self.input_cols),
1200
1223
  identifier.get_unescaped_names(self.label_cols),
1201
1224
  identifier.get_unescaped_names(self.sample_weight_col),
1202
- statement_params=statement_params,
1225
+ statement_params,
1203
1226
  )
1204
1227
 
1205
1228
  cleanup_temp_files([local_score_file_name])
@@ -1217,18 +1240,20 @@ class SequentialFeatureSelector(BaseTransformer):
1217
1240
  if self._sklearn_object._estimator_type == 'classifier':
1218
1241
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1219
1242
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1220
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1243
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1244
+ ([] if self._drop_input_cols else inputs) + outputs)
1221
1245
  # For regressor, the type of predict is float64
1222
1246
  elif self._sklearn_object._estimator_type == 'regressor':
1223
1247
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1224
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1225
-
1248
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1249
+ ([] if self._drop_input_cols else inputs) + outputs)
1226
1250
  for prob_func in PROB_FUNCTIONS:
1227
1251
  if hasattr(self, prob_func):
1228
1252
  output_cols_prefix: str = f"{prob_func}_"
1229
1253
  output_column_names = self._get_output_column_names(output_cols_prefix)
1230
1254
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1231
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1255
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1256
+ ([] if self._drop_input_cols else inputs) + outputs)
1232
1257
 
1233
1258
  @property
1234
1259
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -185,7 +187,6 @@ class VarianceThreshold(BaseTransformer):
185
187
  sample_weight_col: Optional[str] = None,
186
188
  ) -> None:
187
189
  super().__init__()
188
- self.id = str(uuid4()).replace("-", "_").upper()
189
190
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
190
191
 
191
192
  self._deps = list(deps)
@@ -205,6 +206,15 @@ class VarianceThreshold(BaseTransformer):
205
206
  self.set_drop_input_cols(drop_input_cols)
206
207
  self.set_sample_weight_col(sample_weight_col)
207
208
 
209
+ def _get_rand_id(self) -> str:
210
+ """
211
+ Generate random id to be used in sproc and stage names.
212
+
213
+ Returns:
214
+ Random id string usable in sproc, table, and stage names.
215
+ """
216
+ return str(uuid4()).replace("-", "_").upper()
217
+
208
218
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
209
219
  """
210
220
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -283,7 +293,7 @@ class VarianceThreshold(BaseTransformer):
283
293
  cp.dump(self._sklearn_object, local_transform_file)
284
294
 
285
295
  # Create temp stage to run fit.
286
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
296
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
287
297
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
288
298
  SqlResultValidator(
289
299
  session=session,
@@ -296,11 +306,12 @@ class VarianceThreshold(BaseTransformer):
296
306
  expected_value=f"Stage area {transform_stage_name} successfully created."
297
307
  ).validate()
298
308
 
299
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
309
+ # Use posixpath to construct stage paths
310
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
311
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
300
312
  local_result_file_name = get_temp_file_path()
301
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
302
313
 
303
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
314
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
304
315
  statement_params = telemetry.get_function_usage_statement_params(
305
316
  project=_PROJECT,
306
317
  subproject=_SUBPROJECT,
@@ -326,6 +337,7 @@ class VarianceThreshold(BaseTransformer):
326
337
  replace=True,
327
338
  session=session,
328
339
  statement_params=statement_params,
340
+ anonymous=True
329
341
  )
330
342
  def fit_wrapper_sproc(
331
343
  session: Session,
@@ -334,7 +346,8 @@ class VarianceThreshold(BaseTransformer):
334
346
  stage_result_file_name: str,
335
347
  input_cols: List[str],
336
348
  label_cols: List[str],
337
- sample_weight_col: Optional[str]
349
+ sample_weight_col: Optional[str],
350
+ statement_params: Dict[str, str]
338
351
  ) -> str:
339
352
  import cloudpickle as cp
340
353
  import numpy as np
@@ -401,15 +414,15 @@ class VarianceThreshold(BaseTransformer):
401
414
  api_calls=[Session.call],
402
415
  custom_tags=dict([("autogen", True)]),
403
416
  )
404
- sproc_export_file_name = session.call(
405
- fit_sproc_name,
417
+ sproc_export_file_name = fit_wrapper_sproc(
418
+ session,
406
419
  query,
407
420
  stage_transform_file_name,
408
421
  stage_result_file_name,
409
422
  identifier.get_unescaped_names(self.input_cols),
410
423
  identifier.get_unescaped_names(self.label_cols),
411
424
  identifier.get_unescaped_names(self.sample_weight_col),
412
- statement_params=statement_params,
425
+ statement_params,
413
426
  )
414
427
 
415
428
  if "|" in sproc_export_file_name:
@@ -419,7 +432,7 @@ class VarianceThreshold(BaseTransformer):
419
432
  print("\n".join(fields[1:]))
420
433
 
421
434
  session.file.get(
422
- os.path.join(stage_result_file_name, sproc_export_file_name),
435
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
423
436
  local_result_file_name,
424
437
  statement_params=statement_params
425
438
  )
@@ -465,7 +478,7 @@ class VarianceThreshold(BaseTransformer):
465
478
 
466
479
  # Register vectorized UDF for batch inference
467
480
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
468
- safe_id=self.id, method=inference_method)
481
+ safe_id=self._get_rand_id(), method=inference_method)
469
482
 
470
483
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
471
484
  # will try to pickle all of self which fails.
@@ -557,7 +570,7 @@ class VarianceThreshold(BaseTransformer):
557
570
  return transformed_pandas_df.to_dict("records")
558
571
 
559
572
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
560
- safe_id=self.id
573
+ safe_id=self._get_rand_id()
561
574
  )
562
575
 
563
576
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -722,11 +735,18 @@ class VarianceThreshold(BaseTransformer):
722
735
  Transformed dataset.
723
736
  """
724
737
  if isinstance(dataset, DataFrame):
738
+ expected_type_inferred = ""
739
+ # when it is classifier, infer the datatype from label columns
740
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
741
+ expected_type_inferred = convert_sp_to_sf_type(
742
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
743
+ )
744
+
725
745
  output_df = self._batch_inference(
726
746
  dataset=dataset,
727
747
  inference_method="predict",
728
748
  expected_output_cols_list=self.output_cols,
729
- expected_output_cols_type="",
749
+ expected_output_cols_type=expected_type_inferred,
730
750
  )
731
751
  elif isinstance(dataset, pd.DataFrame):
732
752
  output_df = self._sklearn_inference(
@@ -799,10 +819,10 @@ class VarianceThreshold(BaseTransformer):
799
819
 
800
820
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
801
821
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
802
- Returns an empty list if current object is not a classifier or not yet fitted.
822
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
803
823
  """
804
824
  if getattr(self._sklearn_object, "classes_", None) is None:
805
- return []
825
+ return [output_cols_prefix]
806
826
 
807
827
  classes = self._sklearn_object.classes_
808
828
  if isinstance(classes, numpy.ndarray):
@@ -1027,7 +1047,7 @@ class VarianceThreshold(BaseTransformer):
1027
1047
  cp.dump(self._sklearn_object, local_score_file)
1028
1048
 
1029
1049
  # Create temp stage to run score.
1030
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1050
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1031
1051
  session = dataset._session
1032
1052
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1033
1053
  SqlResultValidator(
@@ -1041,8 +1061,9 @@ class VarianceThreshold(BaseTransformer):
1041
1061
  expected_value=f"Stage area {score_stage_name} successfully created."
1042
1062
  ).validate()
1043
1063
 
1044
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1045
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1064
+ # Use posixpath to construct stage paths
1065
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1066
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1046
1067
  statement_params = telemetry.get_function_usage_statement_params(
1047
1068
  project=_PROJECT,
1048
1069
  subproject=_SUBPROJECT,
@@ -1068,6 +1089,7 @@ class VarianceThreshold(BaseTransformer):
1068
1089
  replace=True,
1069
1090
  session=session,
1070
1091
  statement_params=statement_params,
1092
+ anonymous=True
1071
1093
  )
1072
1094
  def score_wrapper_sproc(
1073
1095
  session: Session,
@@ -1075,7 +1097,8 @@ class VarianceThreshold(BaseTransformer):
1075
1097
  stage_score_file_name: str,
1076
1098
  input_cols: List[str],
1077
1099
  label_cols: List[str],
1078
- sample_weight_col: Optional[str]
1100
+ sample_weight_col: Optional[str],
1101
+ statement_params: Dict[str, str]
1079
1102
  ) -> float:
1080
1103
  import cloudpickle as cp
1081
1104
  import numpy as np
@@ -1125,14 +1148,14 @@ class VarianceThreshold(BaseTransformer):
1125
1148
  api_calls=[Session.call],
1126
1149
  custom_tags=dict([("autogen", True)]),
1127
1150
  )
1128
- score = session.call(
1129
- score_sproc_name,
1151
+ score = score_wrapper_sproc(
1152
+ session,
1130
1153
  query,
1131
1154
  stage_score_file_name,
1132
1155
  identifier.get_unescaped_names(self.input_cols),
1133
1156
  identifier.get_unescaped_names(self.label_cols),
1134
1157
  identifier.get_unescaped_names(self.sample_weight_col),
1135
- statement_params=statement_params,
1158
+ statement_params,
1136
1159
  )
1137
1160
 
1138
1161
  cleanup_temp_files([local_score_file_name])
@@ -1150,18 +1173,20 @@ class VarianceThreshold(BaseTransformer):
1150
1173
  if self._sklearn_object._estimator_type == 'classifier':
1151
1174
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1152
1175
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1153
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1176
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1177
+ ([] if self._drop_input_cols else inputs) + outputs)
1154
1178
  # For regressor, the type of predict is float64
1155
1179
  elif self._sklearn_object._estimator_type == 'regressor':
1156
1180
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1157
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1158
-
1181
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1182
+ ([] if self._drop_input_cols else inputs) + outputs)
1159
1183
  for prob_func in PROB_FUNCTIONS:
1160
1184
  if hasattr(self, prob_func):
1161
1185
  output_cols_prefix: str = f"{prob_func}_"
1162
1186
  output_column_names = self._get_output_column_names(output_cols_prefix)
1163
1187
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1164
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1188
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1189
+ ([] if self._drop_input_cols else inputs) + outputs)
1165
1190
 
1166
1191
  @property
1167
1192
  def model_signatures(self) -> Dict[str, ModelSignature]: