snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -255,7 +257,6 @@ class GaussianProcessRegressor(BaseTransformer):
255
257
  sample_weight_col: Optional[str] = None,
256
258
  ) -> None:
257
259
  super().__init__()
258
- self.id = str(uuid4()).replace("-", "_").upper()
259
260
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
260
261
 
261
262
  self._deps = list(deps)
@@ -281,6 +282,15 @@ class GaussianProcessRegressor(BaseTransformer):
281
282
  self.set_drop_input_cols(drop_input_cols)
282
283
  self.set_sample_weight_col(sample_weight_col)
283
284
 
285
+ def _get_rand_id(self) -> str:
286
+ """
287
+ Generate random id to be used in sproc and stage names.
288
+
289
+ Returns:
290
+ Random id string usable in sproc, table, and stage names.
291
+ """
292
+ return str(uuid4()).replace("-", "_").upper()
293
+
284
294
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
285
295
  """
286
296
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -359,7 +369,7 @@ class GaussianProcessRegressor(BaseTransformer):
359
369
  cp.dump(self._sklearn_object, local_transform_file)
360
370
 
361
371
  # Create temp stage to run fit.
362
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
372
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
363
373
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
364
374
  SqlResultValidator(
365
375
  session=session,
@@ -372,11 +382,12 @@ class GaussianProcessRegressor(BaseTransformer):
372
382
  expected_value=f"Stage area {transform_stage_name} successfully created."
373
383
  ).validate()
374
384
 
375
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
385
+ # Use posixpath to construct stage paths
386
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
387
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
376
388
  local_result_file_name = get_temp_file_path()
377
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
378
389
 
379
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
390
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
380
391
  statement_params = telemetry.get_function_usage_statement_params(
381
392
  project=_PROJECT,
382
393
  subproject=_SUBPROJECT,
@@ -402,6 +413,7 @@ class GaussianProcessRegressor(BaseTransformer):
402
413
  replace=True,
403
414
  session=session,
404
415
  statement_params=statement_params,
416
+ anonymous=True
405
417
  )
406
418
  def fit_wrapper_sproc(
407
419
  session: Session,
@@ -410,7 +422,8 @@ class GaussianProcessRegressor(BaseTransformer):
410
422
  stage_result_file_name: str,
411
423
  input_cols: List[str],
412
424
  label_cols: List[str],
413
- sample_weight_col: Optional[str]
425
+ sample_weight_col: Optional[str],
426
+ statement_params: Dict[str, str]
414
427
  ) -> str:
415
428
  import cloudpickle as cp
416
429
  import numpy as np
@@ -477,15 +490,15 @@ class GaussianProcessRegressor(BaseTransformer):
477
490
  api_calls=[Session.call],
478
491
  custom_tags=dict([("autogen", True)]),
479
492
  )
480
- sproc_export_file_name = session.call(
481
- fit_sproc_name,
493
+ sproc_export_file_name = fit_wrapper_sproc(
494
+ session,
482
495
  query,
483
496
  stage_transform_file_name,
484
497
  stage_result_file_name,
485
498
  identifier.get_unescaped_names(self.input_cols),
486
499
  identifier.get_unescaped_names(self.label_cols),
487
500
  identifier.get_unescaped_names(self.sample_weight_col),
488
- statement_params=statement_params,
501
+ statement_params,
489
502
  )
490
503
 
491
504
  if "|" in sproc_export_file_name:
@@ -495,7 +508,7 @@ class GaussianProcessRegressor(BaseTransformer):
495
508
  print("\n".join(fields[1:]))
496
509
 
497
510
  session.file.get(
498
- os.path.join(stage_result_file_name, sproc_export_file_name),
511
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
499
512
  local_result_file_name,
500
513
  statement_params=statement_params
501
514
  )
@@ -541,7 +554,7 @@ class GaussianProcessRegressor(BaseTransformer):
541
554
 
542
555
  # Register vectorized UDF for batch inference
543
556
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
544
- safe_id=self.id, method=inference_method)
557
+ safe_id=self._get_rand_id(), method=inference_method)
545
558
 
546
559
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
547
560
  # will try to pickle all of self which fails.
@@ -633,7 +646,7 @@ class GaussianProcessRegressor(BaseTransformer):
633
646
  return transformed_pandas_df.to_dict("records")
634
647
 
635
648
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
636
- safe_id=self.id
649
+ safe_id=self._get_rand_id()
637
650
  )
638
651
 
639
652
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -689,26 +702,37 @@ class GaussianProcessRegressor(BaseTransformer):
689
702
  # input cols need to match unquoted / quoted
690
703
  input_cols = self.input_cols
691
704
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
705
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
692
706
 
693
707
  estimator = self._sklearn_object
694
708
 
695
- input_df = dataset[input_cols] # Select input columns with quoted column names.
696
- if hasattr(estimator, "feature_names_in_"):
697
- missing_features = []
698
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
699
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
700
- missing_features.append(f)
701
-
702
- if len(missing_features) > 0:
703
- raise ValueError(
704
- "The feature names should match with those that were passed during fit.\n"
705
- f"Features seen during fit call but not present in the input: {missing_features}\n"
706
- f"Features in the input dataframe : {input_cols}\n"
707
- )
708
- input_df.columns = getattr(estimator, "feature_names_in_")
709
- else:
710
- # Just rename the column names to unquoted identifiers.
711
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
709
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
710
+ missing_features = []
711
+ features_in_dataset = set(dataset.columns)
712
+ columns_to_select = []
713
+ for i, f in enumerate(features_required_by_estimator):
714
+ if (
715
+ i >= len(input_cols)
716
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
717
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
718
+ and quoted_input_cols[i] not in features_in_dataset)
719
+ ):
720
+ missing_features.append(f)
721
+ elif input_cols[i] in features_in_dataset:
722
+ columns_to_select.append(input_cols[i])
723
+ elif unquoted_input_cols[i] in features_in_dataset:
724
+ columns_to_select.append(unquoted_input_cols[i])
725
+ else:
726
+ columns_to_select.append(quoted_input_cols[i])
727
+
728
+ if len(missing_features) > 0:
729
+ raise ValueError(
730
+ "The feature names should match with those that were passed during fit.\n"
731
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
732
+ f"Features in the input dataframe : {input_cols}\n"
733
+ )
734
+ input_df = dataset[columns_to_select]
735
+ input_df.columns = features_required_by_estimator
712
736
 
713
737
  transformed_numpy_array = getattr(estimator, inference_method)(
714
738
  input_df
@@ -789,11 +813,18 @@ class GaussianProcessRegressor(BaseTransformer):
789
813
  Transformed dataset.
790
814
  """
791
815
  if isinstance(dataset, DataFrame):
816
+ expected_type_inferred = "float"
817
+ # when it is classifier, infer the datatype from label columns
818
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
819
+ expected_type_inferred = convert_sp_to_sf_type(
820
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
821
+ )
822
+
792
823
  output_df = self._batch_inference(
793
824
  dataset=dataset,
794
825
  inference_method="predict",
795
826
  expected_output_cols_list=self.output_cols,
796
- expected_output_cols_type="float",
827
+ expected_output_cols_type=expected_type_inferred,
797
828
  )
798
829
  elif isinstance(dataset, pd.DataFrame):
799
830
  output_df = self._sklearn_inference(
@@ -864,10 +895,10 @@ class GaussianProcessRegressor(BaseTransformer):
864
895
 
865
896
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
866
897
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
867
- Returns an empty list if current object is not a classifier or not yet fitted.
898
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
868
899
  """
869
900
  if getattr(self._sklearn_object, "classes_", None) is None:
870
- return []
901
+ return [output_cols_prefix]
871
902
 
872
903
  classes = self._sklearn_object.classes_
873
904
  if isinstance(classes, numpy.ndarray):
@@ -1092,7 +1123,7 @@ class GaussianProcessRegressor(BaseTransformer):
1092
1123
  cp.dump(self._sklearn_object, local_score_file)
1093
1124
 
1094
1125
  # Create temp stage to run score.
1095
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1126
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1096
1127
  session = dataset._session
1097
1128
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1098
1129
  SqlResultValidator(
@@ -1106,8 +1137,9 @@ class GaussianProcessRegressor(BaseTransformer):
1106
1137
  expected_value=f"Stage area {score_stage_name} successfully created."
1107
1138
  ).validate()
1108
1139
 
1109
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1110
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1140
+ # Use posixpath to construct stage paths
1141
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1142
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1111
1143
  statement_params = telemetry.get_function_usage_statement_params(
1112
1144
  project=_PROJECT,
1113
1145
  subproject=_SUBPROJECT,
@@ -1133,6 +1165,7 @@ class GaussianProcessRegressor(BaseTransformer):
1133
1165
  replace=True,
1134
1166
  session=session,
1135
1167
  statement_params=statement_params,
1168
+ anonymous=True
1136
1169
  )
1137
1170
  def score_wrapper_sproc(
1138
1171
  session: Session,
@@ -1140,7 +1173,8 @@ class GaussianProcessRegressor(BaseTransformer):
1140
1173
  stage_score_file_name: str,
1141
1174
  input_cols: List[str],
1142
1175
  label_cols: List[str],
1143
- sample_weight_col: Optional[str]
1176
+ sample_weight_col: Optional[str],
1177
+ statement_params: Dict[str, str]
1144
1178
  ) -> float:
1145
1179
  import cloudpickle as cp
1146
1180
  import numpy as np
@@ -1190,14 +1224,14 @@ class GaussianProcessRegressor(BaseTransformer):
1190
1224
  api_calls=[Session.call],
1191
1225
  custom_tags=dict([("autogen", True)]),
1192
1226
  )
1193
- score = session.call(
1194
- score_sproc_name,
1227
+ score = score_wrapper_sproc(
1228
+ session,
1195
1229
  query,
1196
1230
  stage_score_file_name,
1197
1231
  identifier.get_unescaped_names(self.input_cols),
1198
1232
  identifier.get_unescaped_names(self.label_cols),
1199
1233
  identifier.get_unescaped_names(self.sample_weight_col),
1200
- statement_params=statement_params,
1234
+ statement_params,
1201
1235
  )
1202
1236
 
1203
1237
  cleanup_temp_files([local_score_file_name])
@@ -1215,18 +1249,20 @@ class GaussianProcessRegressor(BaseTransformer):
1215
1249
  if self._sklearn_object._estimator_type == 'classifier':
1216
1250
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1217
1251
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1218
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1252
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1253
+ ([] if self._drop_input_cols else inputs) + outputs)
1219
1254
  # For regressor, the type of predict is float64
1220
1255
  elif self._sklearn_object._estimator_type == 'regressor':
1221
1256
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1222
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1223
-
1257
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1258
+ ([] if self._drop_input_cols else inputs) + outputs)
1224
1259
  for prob_func in PROB_FUNCTIONS:
1225
1260
  if hasattr(self, prob_func):
1226
1261
  output_cols_prefix: str = f"{prob_func}_"
1227
1262
  output_column_names = self._get_output_column_names(output_cols_prefix)
1228
1263
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1229
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1264
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1265
+ ([] if self._drop_input_cols else inputs) + outputs)
1230
1266
 
1231
1267
  @property
1232
1268
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -28,6 +29,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
28
29
  from snowflake.snowpark import DataFrame, Session
29
30
  from snowflake.snowpark.functions import pandas_udf, sproc
30
31
  from snowflake.snowpark.types import PandasSeries
32
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
31
33
 
32
34
  from snowflake.ml.model.model_signature import (
33
35
  DataType,
@@ -290,7 +292,6 @@ class IterativeImputer(BaseTransformer):
290
292
  sample_weight_col: Optional[str] = None,
291
293
  ) -> None:
292
294
  super().__init__()
293
- self.id = str(uuid4()).replace("-", "_").upper()
294
295
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
295
296
  deps = deps | _gather_dependencies(estimator)
296
297
  self._deps = list(deps)
@@ -324,6 +325,15 @@ class IterativeImputer(BaseTransformer):
324
325
  self.set_drop_input_cols(drop_input_cols)
325
326
  self.set_sample_weight_col(sample_weight_col)
326
327
 
328
+ def _get_rand_id(self) -> str:
329
+ """
330
+ Generate random id to be used in sproc and stage names.
331
+
332
+ Returns:
333
+ Random id string usable in sproc, table, and stage names.
334
+ """
335
+ return str(uuid4()).replace("-", "_").upper()
336
+
327
337
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
328
338
  """
329
339
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -402,7 +412,7 @@ class IterativeImputer(BaseTransformer):
402
412
  cp.dump(self._sklearn_object, local_transform_file)
403
413
 
404
414
  # Create temp stage to run fit.
405
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
415
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
406
416
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
407
417
  SqlResultValidator(
408
418
  session=session,
@@ -415,11 +425,12 @@ class IterativeImputer(BaseTransformer):
415
425
  expected_value=f"Stage area {transform_stage_name} successfully created."
416
426
  ).validate()
417
427
 
418
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
428
+ # Use posixpath to construct stage paths
429
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
430
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
419
431
  local_result_file_name = get_temp_file_path()
420
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
421
432
 
422
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
433
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
423
434
  statement_params = telemetry.get_function_usage_statement_params(
424
435
  project=_PROJECT,
425
436
  subproject=_SUBPROJECT,
@@ -445,6 +456,7 @@ class IterativeImputer(BaseTransformer):
445
456
  replace=True,
446
457
  session=session,
447
458
  statement_params=statement_params,
459
+ anonymous=True
448
460
  )
449
461
  def fit_wrapper_sproc(
450
462
  session: Session,
@@ -453,7 +465,8 @@ class IterativeImputer(BaseTransformer):
453
465
  stage_result_file_name: str,
454
466
  input_cols: List[str],
455
467
  label_cols: List[str],
456
- sample_weight_col: Optional[str]
468
+ sample_weight_col: Optional[str],
469
+ statement_params: Dict[str, str]
457
470
  ) -> str:
458
471
  import cloudpickle as cp
459
472
  import numpy as np
@@ -520,15 +533,15 @@ class IterativeImputer(BaseTransformer):
520
533
  api_calls=[Session.call],
521
534
  custom_tags=dict([("autogen", True)]),
522
535
  )
523
- sproc_export_file_name = session.call(
524
- fit_sproc_name,
536
+ sproc_export_file_name = fit_wrapper_sproc(
537
+ session,
525
538
  query,
526
539
  stage_transform_file_name,
527
540
  stage_result_file_name,
528
541
  identifier.get_unescaped_names(self.input_cols),
529
542
  identifier.get_unescaped_names(self.label_cols),
530
543
  identifier.get_unescaped_names(self.sample_weight_col),
531
- statement_params=statement_params,
544
+ statement_params,
532
545
  )
533
546
 
534
547
  if "|" in sproc_export_file_name:
@@ -538,7 +551,7 @@ class IterativeImputer(BaseTransformer):
538
551
  print("\n".join(fields[1:]))
539
552
 
540
553
  session.file.get(
541
- os.path.join(stage_result_file_name, sproc_export_file_name),
554
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
542
555
  local_result_file_name,
543
556
  statement_params=statement_params
544
557
  )
@@ -584,7 +597,7 @@ class IterativeImputer(BaseTransformer):
584
597
 
585
598
  # Register vectorized UDF for batch inference
586
599
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
587
- safe_id=self.id, method=inference_method)
600
+ safe_id=self._get_rand_id(), method=inference_method)
588
601
 
589
602
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
590
603
  # will try to pickle all of self which fails.
@@ -676,7 +689,7 @@ class IterativeImputer(BaseTransformer):
676
689
  return transformed_pandas_df.to_dict("records")
677
690
 
678
691
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
679
- safe_id=self.id
692
+ safe_id=self._get_rand_id()
680
693
  )
681
694
 
682
695
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -732,26 +745,37 @@ class IterativeImputer(BaseTransformer):
732
745
  # input cols need to match unquoted / quoted
733
746
  input_cols = self.input_cols
734
747
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
748
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
735
749
 
736
750
  estimator = self._sklearn_object
737
751
 
738
- input_df = dataset[input_cols] # Select input columns with quoted column names.
739
- if hasattr(estimator, "feature_names_in_"):
740
- missing_features = []
741
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
742
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
743
- missing_features.append(f)
744
-
745
- if len(missing_features) > 0:
746
- raise ValueError(
747
- "The feature names should match with those that were passed during fit.\n"
748
- f"Features seen during fit call but not present in the input: {missing_features}\n"
749
- f"Features in the input dataframe : {input_cols}\n"
750
- )
751
- input_df.columns = getattr(estimator, "feature_names_in_")
752
- else:
753
- # Just rename the column names to unquoted identifiers.
754
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
752
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
753
+ missing_features = []
754
+ features_in_dataset = set(dataset.columns)
755
+ columns_to_select = []
756
+ for i, f in enumerate(features_required_by_estimator):
757
+ if (
758
+ i >= len(input_cols)
759
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
760
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
761
+ and quoted_input_cols[i] not in features_in_dataset)
762
+ ):
763
+ missing_features.append(f)
764
+ elif input_cols[i] in features_in_dataset:
765
+ columns_to_select.append(input_cols[i])
766
+ elif unquoted_input_cols[i] in features_in_dataset:
767
+ columns_to_select.append(unquoted_input_cols[i])
768
+ else:
769
+ columns_to_select.append(quoted_input_cols[i])
770
+
771
+ if len(missing_features) > 0:
772
+ raise ValueError(
773
+ "The feature names should match with those that were passed during fit.\n"
774
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
775
+ f"Features in the input dataframe : {input_cols}\n"
776
+ )
777
+ input_df = dataset[columns_to_select]
778
+ input_df.columns = features_required_by_estimator
755
779
 
756
780
  transformed_numpy_array = getattr(estimator, inference_method)(
757
781
  input_df
@@ -830,11 +854,18 @@ class IterativeImputer(BaseTransformer):
830
854
  Transformed dataset.
831
855
  """
832
856
  if isinstance(dataset, DataFrame):
857
+ expected_type_inferred = ""
858
+ # when it is classifier, infer the datatype from label columns
859
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
860
+ expected_type_inferred = convert_sp_to_sf_type(
861
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
862
+ )
863
+
833
864
  output_df = self._batch_inference(
834
865
  dataset=dataset,
835
866
  inference_method="predict",
836
867
  expected_output_cols_list=self.output_cols,
837
- expected_output_cols_type="",
868
+ expected_output_cols_type=expected_type_inferred,
838
869
  )
839
870
  elif isinstance(dataset, pd.DataFrame):
840
871
  output_df = self._sklearn_inference(
@@ -907,10 +938,10 @@ class IterativeImputer(BaseTransformer):
907
938
 
908
939
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
909
940
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
910
- Returns an empty list if current object is not a classifier or not yet fitted.
941
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
911
942
  """
912
943
  if getattr(self._sklearn_object, "classes_", None) is None:
913
- return []
944
+ return [output_cols_prefix]
914
945
 
915
946
  classes = self._sklearn_object.classes_
916
947
  if isinstance(classes, numpy.ndarray):
@@ -1135,7 +1166,7 @@ class IterativeImputer(BaseTransformer):
1135
1166
  cp.dump(self._sklearn_object, local_score_file)
1136
1167
 
1137
1168
  # Create temp stage to run score.
1138
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1169
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1139
1170
  session = dataset._session
1140
1171
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1141
1172
  SqlResultValidator(
@@ -1149,8 +1180,9 @@ class IterativeImputer(BaseTransformer):
1149
1180
  expected_value=f"Stage area {score_stage_name} successfully created."
1150
1181
  ).validate()
1151
1182
 
1152
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1153
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1183
+ # Use posixpath to construct stage paths
1184
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1185
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1154
1186
  statement_params = telemetry.get_function_usage_statement_params(
1155
1187
  project=_PROJECT,
1156
1188
  subproject=_SUBPROJECT,
@@ -1176,6 +1208,7 @@ class IterativeImputer(BaseTransformer):
1176
1208
  replace=True,
1177
1209
  session=session,
1178
1210
  statement_params=statement_params,
1211
+ anonymous=True
1179
1212
  )
1180
1213
  def score_wrapper_sproc(
1181
1214
  session: Session,
@@ -1183,7 +1216,8 @@ class IterativeImputer(BaseTransformer):
1183
1216
  stage_score_file_name: str,
1184
1217
  input_cols: List[str],
1185
1218
  label_cols: List[str],
1186
- sample_weight_col: Optional[str]
1219
+ sample_weight_col: Optional[str],
1220
+ statement_params: Dict[str, str]
1187
1221
  ) -> float:
1188
1222
  import cloudpickle as cp
1189
1223
  import numpy as np
@@ -1233,14 +1267,14 @@ class IterativeImputer(BaseTransformer):
1233
1267
  api_calls=[Session.call],
1234
1268
  custom_tags=dict([("autogen", True)]),
1235
1269
  )
1236
- score = session.call(
1237
- score_sproc_name,
1270
+ score = score_wrapper_sproc(
1271
+ session,
1238
1272
  query,
1239
1273
  stage_score_file_name,
1240
1274
  identifier.get_unescaped_names(self.input_cols),
1241
1275
  identifier.get_unescaped_names(self.label_cols),
1242
1276
  identifier.get_unescaped_names(self.sample_weight_col),
1243
- statement_params=statement_params,
1277
+ statement_params,
1244
1278
  )
1245
1279
 
1246
1280
  cleanup_temp_files([local_score_file_name])
@@ -1258,18 +1292,20 @@ class IterativeImputer(BaseTransformer):
1258
1292
  if self._sklearn_object._estimator_type == 'classifier':
1259
1293
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1260
1294
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1261
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1295
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1296
+ ([] if self._drop_input_cols else inputs) + outputs)
1262
1297
  # For regressor, the type of predict is float64
1263
1298
  elif self._sklearn_object._estimator_type == 'regressor':
1264
1299
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1265
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1266
-
1300
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1301
+ ([] if self._drop_input_cols else inputs) + outputs)
1267
1302
  for prob_func in PROB_FUNCTIONS:
1268
1303
  if hasattr(self, prob_func):
1269
1304
  output_cols_prefix: str = f"{prob_func}_"
1270
1305
  output_column_names = self._get_output_column_names(output_cols_prefix)
1271
1306
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1272
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1307
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1308
+ ([] if self._drop_input_cols else inputs) + outputs)
1273
1309
 
1274
1310
  @property
1275
1311
  def model_signatures(self) -> Dict[str, ModelSignature]: