snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/feature_selection/variance_threshold.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -185,7 +187,6 @@ class VarianceThreshold(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -205,6 +206,15 @@ class VarianceThreshold(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
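The hunk above replaces the single `self.id` minted in `__init__` with `_get_rand_id()`, which returns a fresh identifier on every call, so repeated fits of one estimator no longer reuse (and clobber) the same stage, sproc, and table names. A minimal standalone sketch of the helper's behavior:

```python
from uuid import uuid4

def get_rand_id() -> str:
    # Hyphens are not valid in unquoted Snowflake identifiers, so they are
    # replaced before the id is embedded in object names.
    return str(uuid4()).replace("-", "_").upper()

# Every call yields a distinct id:
print("SNOWML_TRANSFORM_{}".format(get_rand_id()))
print("SNOWML_TRANSFORM_{}".format(get_rand_id()))  # a different name
```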
@@ -283,7 +293,7 @@ class VarianceThreshold(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -296,11 +306,12 @@ class VarianceThreshold(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
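The switch from `os.path.join` to `posixpath.join` matters because Snowflake stage paths always use forward slashes, while `os.path.join` follows the client OS. A small stdlib-only illustration:

```python
import ntpath      # the os.path implementation used on Windows
import posixpath   # always joins with "/", on any platform

stage, fname = "SNOWML_TRANSFORM_ABC123", "model.pkl"
print(ntpath.join(stage, fname))     # SNOWML_TRANSFORM_ABC123\model.pkl -- not a valid stage path
print(posixpath.join(stage, fname))  # SNOWML_TRANSFORM_ABC123/model.pkl
```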
@@ -326,6 +337,7 @@ class VarianceThreshold(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -334,7 +346,8 @@ class VarianceThreshold(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -401,15 +414,15 @@ class VarianceThreshold(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -419,7 +432,7 @@ class VarianceThreshold(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -465,7 +478,7 @@ class VarianceThreshold(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -557,7 +570,7 @@ class VarianceThreshold(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -613,26 +626,37 @@ class VarianceThreshold(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols]  # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols  # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
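The rewritten lookup tolerates three spellings of each input column — as given, unquoted, and quoted — and selects whichever spelling actually exists in the dataframe before declaring a feature missing. A hypothetical, plain-Python distillation of the matching rule (not the `identifier` helpers themselves):

```python
from typing import List, Optional, Set

def resolve_column(i: int, feature: str, input_cols: List[str],
                   unquoted: List[str], quoted: List[str],
                   dataset_cols: Set[str]) -> Optional[str]:
    spellings = (input_cols[i], unquoted[i], quoted[i])
    if feature not in spellings:      # name mismatch in every spelling
        return None
    for candidate in spellings:       # prefer the spelling as given
        if candidate in dataset_cols:
            return candidate
    return None                       # matched by name, but absent from the data

# '"A"' (quoted) and 'A' (unquoted) name the same logical column:
print(resolve_column(0, "A", ['"A"'], ["A"], ['"A"'], {'"A"'}))  # '"A"'
print(resolve_column(0, "A", ['"A"'], ["A"], ['"A"'], {"B"}))    # None -> missing
```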
@@ -711,11 +735,18 @@ class VarianceThreshold(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
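Rather than passing an empty `expected_output_cols_type`, `predict` on a Snowpark DataFrame now derives the SQL type from the saved model signature through Snowpark's internal `convert_sp_to_sf_type`. Roughly:

```python
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
from snowflake.snowpark.types import DoubleType

# Maps a Snowpark type to the Snowflake SQL type name that backs it,
# e.g. DoubleType() -> "DOUBLE":
print(convert_sp_to_sf_type(DoubleType()))

# In the hunk above, the Snowpark type comes from the stored signature:
#   sig = self.model_signatures["predict"]
#   expected = convert_sp_to_sf_type(sig.outputs[0].as_snowpark_type())
```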
@@ -788,10 +819,10 @@ class VarianceThreshold(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
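Returning `[output_cols_prefix]` instead of `[]` guarantees that estimators without `classes_` still yield one output column, so the `FeatureSpec` list built from this method in the signature hunk further down is never empty. A simplified sketch (the per-class branch here is a hypothetical reconstruction; the real branch continues past the lines shown in the diff):

```python
from typing import List, Optional, Sequence

def get_output_column_names(classes: Optional[Sequence], output_cols_prefix: str) -> List[str]:
    if classes is None:
        # New behavior: one column named after the prefix itself.
        return [output_cols_prefix]
    # Hypothetical per-class naming, for illustration only.
    return [f"{output_cols_prefix}{c}" for c in classes]

print(get_output_column_names(None, "predict_proba_"))    # ['predict_proba_']
print(get_output_column_names([0, 1], "predict_proba_"))  # ['predict_proba_0', 'predict_proba_1']
```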
@@ -1016,7 +1047,7 @@ class VarianceThreshold(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1030,8 +1061,9 @@ class VarianceThreshold(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1057,6 +1089,7 @@ class VarianceThreshold(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1064,7 +1097,8 @@ class VarianceThreshold(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1114,14 +1148,14 @@ class VarianceThreshold(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1139,18 +1173,20 @@ class VarianceThreshold(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
            outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns is the desired type for output
            outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
                                                                        ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
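Every signature now prepends the input features to the outputs unless `drop_input_cols` is set, matching what batch inference actually returns when inputs are passed through. A small sketch with the public signature classes and hypothetical column names:

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="FEATURE_0")]   # hypothetical column
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_0")]   # hypothetical column

drop_input_cols = False
sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
# With drop_input_cols=False the signature advertises FEATURE_0 and OUTPUT_0
# as outputs; with drop_input_cols=True, only OUTPUT_0.
```

The same set of changes is applied to every autogenerated estimator in this release; the diff for `GaussianProcessClassifier` below is representative of the rest.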
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -270,7 +272,6 @@ class GaussianProcessClassifier(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -298,6 +299,15 @@ class GaussianProcessClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -376,7 +386,7 @@ class GaussianProcessClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -389,11 +399,12 @@ class GaussianProcessClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -419,6 +430,7 @@ class GaussianProcessClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -427,7 +439,8 @@ class GaussianProcessClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -494,15 +507,15 @@ class GaussianProcessClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -512,7 +525,7 @@ class GaussianProcessClassifier(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -558,7 +571,7 @@ class GaussianProcessClassifier(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -650,7 +663,7 @@ class GaussianProcessClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -706,26 +719,37 @@ class GaussianProcessClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols]  # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols  # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -806,11 +830,18 @@ class GaussianProcessClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -881,10 +912,10 @@ class GaussianProcessClassifier(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1113,7 +1144,7 @@ class GaussianProcessClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1127,8 +1158,9 @@ class GaussianProcessClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1154,6 +1186,7 @@ class GaussianProcessClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1161,7 +1194,8 @@ class GaussianProcessClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1211,14 +1245,14 @@ class GaussianProcessClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1236,18 +1270,20 @@ class GaussianProcessClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: