snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -368,7 +370,6 @@ class GradientBoostingRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -408,6 +409,15 @@ class GradientBoostingRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
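
The per-instance `self.id` removed above is replaced by the `_get_rand_id()` helper, so stage, sproc, and table names are regenerated on every call instead of being fixed for the lifetime of the estimator. A minimal standalone sketch of what the helper produces (variable names are illustrative):

# uuid4 output contains hyphens, which are not legal in unquoted Snowflake
# identifiers, so they are replaced before the id is embedded in an object name.
from uuid import uuid4

def get_rand_id() -> str:
    return str(uuid4()).replace("-", "_").upper()

stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=get_rand_id())
# e.g. 'SNOWML_TRANSFORM_1B9F0C2A_7D42_4E1B_9C0D_2F6A8E5B3C71' (fresh each call)
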
@@ -486,7 +496,7 @@ class GradientBoostingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -499,11 +509,12 @@ class GradientBoostingRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
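
Snowflake stage paths are always slash-separated regardless of the client OS, which is why the hunk above switches from `os.path.join` to `posixpath.join`. A small sketch of the failure mode this avoids, using `ntpath` to stand in for `os.path` on a Windows client:

import ntpath      # behaves like os.path does on Windows
import posixpath

stage, fname = "SNOWML_TRANSFORM_ABC", "model.pkl"
ntpath.join(stage, fname)     # 'SNOWML_TRANSFORM_ABC\\model.pkl' -> invalid stage path
posixpath.join(stage, fname)  # 'SNOWML_TRANSFORM_ABC/model.pkl'  -> correct on any OS
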
@@ -529,6 +540,7 @@ class GradientBoostingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -537,7 +549,8 @@ class GradientBoostingRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -604,15 +617,15 @@ class GradientBoostingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -622,7 +635,7 @@ class GradientBoostingRegressor(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -668,7 +681,7 @@ class GradientBoostingRegressor(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -760,7 +773,7 @@ class GradientBoostingRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -816,26 +829,37 @@ class GradientBoostingRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
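
The rewritten matching loop above now tolerates all three spellings of each input column (original, unquoted, quoted) and checks that a spelling actually exists in the dataset before selecting it, instead of assuming `dataset[input_cols]` succeeds. Condensed into a standalone function for readability (the function and parameter names are illustrative, not part of the package):

from typing import List, Sequence, Set

def select_feature_columns(
    required: Sequence[str],   # estimator.feature_names_in_, or the unquoted cols
    original: List[str],       # self.input_cols
    unquoted: List[str],       # identifier.get_unescaped_names(...)
    quoted: List[str],         # identifier.get_escaped_names(...)
    available: Set[str],       # set(dataset.columns)
) -> List[str]:
    missing, selected = [], []
    for i, f in enumerate(required):
        spellings = (original[i], unquoted[i], quoted[i]) if i < len(original) else ()
        if not spellings or f not in spellings or not any(s in available for s in spellings):
            missing.append(f)
            continue
        # Prefer the original spelling, then unquoted, then quoted.
        selected.append(next(s for s in spellings if s in available))
    if missing:
        raise ValueError(f"Features seen during fit but missing from the input: {missing}")
    return selected
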
@@ -916,11 +940,18 @@ class GradientBoostingRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -991,10 +1022,10 @@ class GradientBoostingRegressor(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
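
Returning `[output_cols_prefix]` instead of an empty list means estimators without a `classes_` attribute still produce one named output column for functions such as `decision_function`. An illustrative sketch; the per-class suffix convention shown is an assumption based on the prefix naming above, not visible in this hunk:

def get_output_column_names(classes, prefix: str):
    # No classes_: a single column named after the prefix (new behavior).
    if classes is None:
        return [prefix]
    # Assumed convention: one column per class, suffixed with the class label.
    return [f"{prefix}{c}" for c in classes]

get_output_column_names(None, "decision_function_")  # ['decision_function_']
get_output_column_names([0, 1], "predict_proba_")    # ['predict_proba_0', 'predict_proba_1']
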
@@ -1219,7 +1250,7 @@ class GradientBoostingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1233,8 +1264,9 @@ class GradientBoostingRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1260,6 +1292,7 @@ class GradientBoostingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1267,7 +1300,8 @@ class GradientBoostingRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1317,14 +1351,14 @@ class GradientBoostingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1342,18 +1376,20 @@ class GradientBoostingRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
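
Every `ModelSignature` now lists the pass-through input features ahead of the prediction columns in its outputs, unless `drop_input_cols` is set, matching what batch inference actually returns. A sketch using the same classes the hunks import; the column names are illustrative:

from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in ("SEPAL_LENGTH", "SEPAL_WIDTH")]
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_PRED")]
drop_input_cols = False  # stands in for self._drop_input_cols

# Inputs are prepended to the outputs unless they are dropped.
sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
# sig outputs: SEPAL_LENGTH, SEPAL_WIDTH, OUTPUT_PRED
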
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -338,7 +340,6 @@ class HistGradientBoostingClassifier(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -377,6 +378,15 @@ class HistGradientBoostingClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -455,7 +465,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -468,11 +478,12 @@ class HistGradientBoostingClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -498,6 +509,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -506,7 +518,8 @@ class HistGradientBoostingClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -573,15 +586,15 @@ class HistGradientBoostingClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -591,7 +604,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -637,7 +650,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -729,7 +742,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -785,26 +798,37 @@ class HistGradientBoostingClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -885,11 +909,18 @@ class HistGradientBoostingClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
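
Unlike the regressor, the classifier starts `expected_type_inferred` as an empty string, so the branch above actually fires and derives the Snowflake type of the predict column from the stored model signature rather than assuming "float". A sketch of the final conversion step; the example types assume integer and string labels respectively:

from snowflake.snowpark.types import LongType, StringType
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

# as_snowpark_type() on the label-derived output FeatureSpec yields a Snowpark
# type, which convert_sp_to_sf_type maps to its Snowflake SQL name.
convert_sp_to_sf_type(LongType())    # e.g. 'BIGINT'
convert_sp_to_sf_type(StringType())  # e.g. 'STRING'
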
@@ -960,10 +991,10 @@ class HistGradientBoostingClassifier(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1194,7 +1225,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1208,8 +1239,9 @@ class HistGradientBoostingClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1235,6 +1267,7 @@ class HistGradientBoostingClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1242,7 +1275,8 @@ class HistGradientBoostingClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1292,14 +1326,14 @@ class HistGradientBoostingClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1317,18 +1351,20 @@ class HistGradientBoostingClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
            if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: