snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
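
Nearly all of the snowflake/ml/modeling wrappers (files 33–190) carry the identical +79 −43 change because they are generated from a common wrapper template (note the autogen=True telemetry tag in the hunks below). The hunks that follow show that template change for two representative files, xgb_classifier.py (#187) and xgb_regressor.py (#188).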
snowflake/ml/modeling/xgboost/xgb_classifier.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -26,6 +27,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -390,7 +392,6 @@ class XGBClassifier(BaseTransformer):
         **kwargs,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -412,6 +413,15 @@ class XGBClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
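
The per-instance id, fixed once at construction, is replaced by a _get_rand_id() helper that draws a fresh identifier at every call site, so repeated fit/predict/score calls on the same estimator no longer reuse stage, sproc, and table names. A standalone illustration of the identifier format (not part of the package):

    from uuid import uuid4

    # A UUID, upper-cased and with hyphens replaced by underscores, stays
    # valid inside unquoted SQL identifiers such as SNOWML_TRANSFORM_<id>.
    rand_id = str(uuid4()).replace("-", "_").upper()
    print(f"SNOWML_TRANSFORM_{rand_id}")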
@@ -490,7 +500,7 @@ class XGBClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -503,11 +513,12 @@ class XGBClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
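
Stage paths in Snowflake always use forward slashes, while os.path.join follows the client OS and emits backslashes on Windows. A minimal sketch of the difference (ntpath is the Windows flavor of os.path and is importable on any platform):

    import ntpath      # what os.path resolves to on a Windows client
    import posixpath

    stage_dir = "SNOWML_TRANSFORM_ABC123"
    base_name = "model.pkl"

    print(ntpath.join(stage_dir, base_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
    print(posixpath.join(stage_dir, base_name))  # SNOWML_TRANSFORM_ABC123/model.pkl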
@@ -533,6 +544,7 @@ class XGBClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -541,7 +553,8 @@ class XGBClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -608,15 +621,15 @@ class XGBClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
@@ -626,7 +639,7 @@ class XGBClassifier(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
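
Two related changes in this fit path: the wrapper is registered with anonymous=True, and it is invoked through the handle returned by registration (fit_wrapper_sproc(session, ...)) rather than by name via session.call(fit_sproc_name, ...). An anonymous stored procedure is not persisted under a name, so concurrent fits cannot collide on the sproc name or leave stale objects behind; statement_params accordingly becomes an explicit parameter of the wrapper instead of a keyword argument to session.call.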
@@ -672,7 +685,7 @@ class XGBClassifier(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -764,7 +777,7 @@ class XGBClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -820,26 +833,37 @@ class XGBClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
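
The old matching only compared feature_names_in_ against the quoted and unquoted spellings of input_cols and selected dataset[input_cols] unconditionally; the rewrite also tries the re-escaped spelling and picks whichever spelling actually exists in the pandas frame. A standalone sketch of the selection rule, with trivial stand-ins for the identifier helpers (the real ones implement Snowflake identifier rules, not just quote stripping):

    def get_unescaped(name: str) -> str:  # stand-in for identifier.get_unescaped_names
        return name.strip('"')

    def get_escaped(name: str) -> str:    # stand-in for identifier.get_escaped_names
        return f'"{name}"'

    input_cols = ['"SEPAL_LENGTH"', '"SEPAL_WIDTH"']   # as configured (quoted)
    dataset_columns = {"SEPAL_LENGTH", "SEPAL_WIDTH"}  # as present in the frame
    required = ["SEPAL_LENGTH", "SEPAL_WIDTH"]         # estimator.feature_names_in_

    columns_to_select = []
    for i, feature in enumerate(required):
        as_given = input_cols[i]
        unquoted = get_unescaped(as_given)
        quoted = get_escaped(unquoted)
        spellings = (as_given, unquoted, quoted)
        # Missing if no spelling matches the fitted name, or none is a real column.
        if feature not in spellings or not any(s in dataset_columns for s in spellings):
            raise ValueError(f"Feature seen during fit but not present in the input: {feature}")
        columns_to_select.append(next(s for s in spellings if s in dataset_columns))

    print(columns_to_select)  # ['SEPAL_LENGTH', 'SEPAL_WIDTH']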
@@ -920,11 +944,18 @@ class XGBClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
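
predict on a Snowpark DataFrame previously left the expected output column type blank; now, once a signature has been recorded for predict, the SQL type is derived from the signature's first output via snowpark's internal convert_sp_to_sf_type. Roughly (an illustration of the internal helper the diff imports; exact SQL type spellings depend on the snowpark version):

    from snowflake.snowpark.types import DoubleType, LongType, StringType
    from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

    # Maps a Snowpark DataType to a Snowflake SQL type name.
    print(convert_sp_to_sf_type(LongType()))    # e.g. BIGINT
    print(convert_sp_to_sf_type(DoubleType()))  # e.g. DOUBLE
    print(convert_sp_to_sf_type(StringType()))  # e.g. STRING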
@@ -995,10 +1026,10 @@ class XGBClassifier(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1227,7 +1258,7 @@ class XGBClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1241,8 +1272,9 @@ class XGBClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1268,6 +1300,7 @@ class XGBClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1275,7 +1308,8 @@ class XGBClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1325,14 +1359,14 @@ class XGBClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1350,18 +1384,20 @@ class XGBClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                    ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                    ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                        ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
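
The final classifier hunk widens every recorded signature: unless drop_input_cols is set, declared outputs now list the input features ahead of the prediction columns, matching what batch inference actually returns (inputs are passed through to the output frame). Schematically, for inputs [F1, F2] and a hypothetical prediction column OUTPUT_PRED:

    outputs = [F1, F2, OUTPUT_PRED]  # drop_input_cols=False (default)
    outputs = [OUTPUT_PRED]          # drop_input_cols=True

The same template change follows for xgb_regressor.py.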
snowflake/ml/modeling/xgboost/xgb_regressor.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -26,6 +27,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -390,7 +392,6 @@ class XGBRegressor(BaseTransformer):
         **kwargs,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'xgboost=={xgboost.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -411,6 +412,15 @@ class XGBRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -489,7 +499,7 @@ class XGBRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -502,11 +512,12 @@ class XGBRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -532,6 +543,7 @@ class XGBRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -540,7 +552,8 @@ class XGBRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -607,15 +620,15 @@ class XGBRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
@@ -625,7 +638,7 @@ class XGBRegressor(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -671,7 +684,7 @@ class XGBRegressor(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -763,7 +776,7 @@ class XGBRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -819,26 +832,37 @@ class XGBRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -919,11 +943,18 @@ class XGBRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
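
This hunk is the one place the regressor diverges from the classifier version: expected_type_inferred is seeded with "float" rather than "", so the signature-based branch (guarded by expected_type_inferred == "") never runs here and predictions keep their previous float typing. The guard and its classifier-oriented comment come along from the shared autogen template.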
@@ -994,10 +1025,10 @@ class XGBRegressor(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1222,7 +1253,7 @@ class XGBRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1236,8 +1267,9 @@ class XGBRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1263,6 +1295,7 @@ class XGBRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1270,7 +1303,8 @@ class XGBRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1320,14 +1354,14 @@ class XGBRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1345,18 +1379,20 @@ class XGBRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                    ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                    ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                        ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: