snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/ensemble/extra_trees_regressor.py
@@ -7,6 +7,7 @@
  #
  import inspect
  import os
+ import posixpath
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
  from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
  from snowflake.snowpark import DataFrame, Session
  from snowflake.snowpark.functions import pandas_udf, sproc
  from snowflake.snowpark.types import PandasSeries
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

  from snowflake.ml.model.model_signature import (
  DataType,
@@ -328,7 +330,6 @@ class ExtraTreesRegressor(BaseTransformer):
  sample_weight_col: Optional[str] = None,
  ) -> None:
  super().__init__()
- self.id = str(uuid4()).replace("-", "_").upper()
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

  self._deps = list(deps)
@@ -364,6 +365,15 @@ class ExtraTreesRegressor(BaseTransformer):
  self.set_drop_input_cols(drop_input_cols)
  self.set_sample_weight_col(sample_weight_col)

+ def _get_rand_id(self) -> str:
+ """
+ Generate random id to be used in sproc and stage names.
+
+ Returns:
+ Random id string usable in sproc, table, and stage names.
+ """
+ return str(uuid4()).replace("-", "_").upper()
+
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
  """
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -442,7 +452,7 @@ class ExtraTreesRegressor(BaseTransformer):
  cp.dump(self._sklearn_object, local_transform_file)

  # Create temp stage to run fit.
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
  SqlResultValidator(
  session=session,
@@ -455,11 +465,12 @@ class ExtraTreesRegressor(BaseTransformer):
  expected_value=f"Stage area {transform_stage_name} successfully created."
  ).validate()

- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ # Use posixpath to construct stage paths
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
  local_result_file_name = get_temp_file_path()
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -485,6 +496,7 @@ class ExtraTreesRegressor(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def fit_wrapper_sproc(
  session: Session,
@@ -493,7 +505,8 @@ class ExtraTreesRegressor(BaseTransformer):
  stage_result_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> str:
  import cloudpickle as cp
  import numpy as np
@@ -560,15 +573,15 @@ class ExtraTreesRegressor(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- sproc_export_file_name = session.call(
- fit_sproc_name,
+ sproc_export_file_name = fit_wrapper_sproc(
+ session,
  query,
  stage_transform_file_name,
  stage_result_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

@@ -578,7 +591,7 @@ class ExtraTreesRegressor(BaseTransformer):
  print("\n".join(fields[1:]))

  session.file.get(
- os.path.join(stage_result_file_name, sproc_export_file_name),
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
  local_result_file_name,
  statement_params=statement_params
  )
@@ -624,7 +637,7 @@ class ExtraTreesRegressor(BaseTransformer):

  # Register vectorized UDF for batch inference
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
- safe_id=self.id, method=inference_method)
+ safe_id=self._get_rand_id(), method=inference_method)

  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
  # will try to pickle all of self which fails.
@@ -716,7 +729,7 @@ class ExtraTreesRegressor(BaseTransformer):
  return transformed_pandas_df.to_dict("records")

  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
- safe_id=self.id
+ safe_id=self._get_rand_id()
  )

  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -772,26 +785,37 @@ class ExtraTreesRegressor(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
- missing_features = []
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
- missing_features.append(f)
-
- if len(missing_features) > 0:
- raise ValueError(
- "The feature names should match with those that were passed during fit.\n"
- f"Features seen during fit call but not present in the input: {missing_features}\n"
- f"Features in the input dataframe : {input_cols}\n"
- )
- input_df.columns = getattr(estimator, "feature_names_in_")
- else:
- # Just rename the column names to unquoted identifiers.
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+ if (
+ i >= len(input_cols)
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+ and quoted_input_cols[i] not in features_in_dataset)
+ ):
+ missing_features.append(f)
+ elif input_cols[i] in features_in_dataset:
+ columns_to_select.append(input_cols[i])
+ elif unquoted_input_cols[i] in features_in_dataset:
+ columns_to_select.append(unquoted_input_cols[i])
+ else:
+ columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+ raise ValueError(
+ "The feature names should match with those that were passed during fit.\n"
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
+ f"Features in the input dataframe : {input_cols}\n"
+ )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
  input_df
@@ -872,11 +896,18 @@ class ExtraTreesRegressor(BaseTransformer):
  Transformed dataset.
  """
  if isinstance(dataset, DataFrame):
+ expected_type_inferred = "float"
+ # when it is classifier, infer the datatype from label columns
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
+ expected_type_inferred = convert_sp_to_sf_type(
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
+ )
+
  output_df = self._batch_inference(
  dataset=dataset,
  inference_method="predict",
  expected_output_cols_list=self.output_cols,
- expected_output_cols_type="float",
+ expected_output_cols_type=expected_type_inferred,
  )
  elif isinstance(dataset, pd.DataFrame):
  output_df = self._sklearn_inference(
@@ -947,10 +978,10 @@ class ExtraTreesRegressor(BaseTransformer):

  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
- Returns an empty list if current object is not a classifier or not yet fitted.
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
  """
  if getattr(self._sklearn_object, "classes_", None) is None:
- return []
+ return [output_cols_prefix]

  classes = self._sklearn_object.classes_
  if isinstance(classes, numpy.ndarray):
@@ -1175,7 +1206,7 @@ class ExtraTreesRegressor(BaseTransformer):
  cp.dump(self._sklearn_object, local_score_file)

  # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  session = dataset._session
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
  SqlResultValidator(
@@ -1189,8 +1220,9 @@ class ExtraTreesRegressor(BaseTransformer):
  expected_value=f"Stage area {score_stage_name} successfully created."
  ).validate()

- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ # Use posixpath to construct stage paths
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -1216,6 +1248,7 @@ class ExtraTreesRegressor(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def score_wrapper_sproc(
  session: Session,
@@ -1223,7 +1256,8 @@ class ExtraTreesRegressor(BaseTransformer):
  stage_score_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> float:
  import cloudpickle as cp
  import numpy as np
@@ -1273,14 +1307,14 @@ class ExtraTreesRegressor(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- score = session.call(
- score_sproc_name,
+ score = score_wrapper_sproc(
+ session,
  query,
  stage_score_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  cleanup_temp_files([local_score_file_name])
@@ -1298,18 +1332,20 @@ class ExtraTreesRegressor(BaseTransformer):
  if self._sklearn_object._estimator_type == 'classifier':
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  # For regressor, the type of predict is float64
  elif self._sklearn_object._estimator_type == 'regressor':
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  for prob_func in PROB_FUNCTIONS:
  if hasattr(self, prob_func):
  output_cols_prefix: str = f"{prob_func}_"
  output_column_names = self._get_output_column_names(output_cols_prefix)
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)

  @property
  def model_signatures(self) -> Dict[str, ModelSignature]:
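
A note on two recurring changes in the hunks above: stage file paths are now built with posixpath.join instead of os.path.join, because Snowflake stage paths always use "/" as the separator while os.path.join uses the separator of the client OS; and the shared per-instance self.id was replaced with a fresh _get_rand_id() per generated stage, table, and sproc name. A minimal sketch of the path difference (ntpath is the Windows flavor of os.path, posixpath the POSIX one; the stage and file names here are made up for illustration):

import ntpath
import posixpath

stage_name = "SNOWML_TRANSFORM_ABC"
file_name = "model.pkl"

# On a Windows client, os.path.join behaves like ntpath.join and inserts "\",
# which is not a valid separator inside a Snowflake stage path.
print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC\model.pkl
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC/model.pkl
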
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py
@@ -7,6 +7,7 @@
  #
  import inspect
  import os
+ import posixpath
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
  from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
  from snowflake.snowpark import DataFrame, Session
  from snowflake.snowpark.functions import pandas_udf, sproc
  from snowflake.snowpark.types import PandasSeries
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

  from snowflake.ml.model.model_signature import (
  DataType,
@@ -360,7 +362,6 @@ class GradientBoostingClassifier(BaseTransformer):
  sample_weight_col: Optional[str] = None,
  ) -> None:
  super().__init__()
- self.id = str(uuid4()).replace("-", "_").upper()
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

  self._deps = list(deps)
@@ -399,6 +400,15 @@ class GradientBoostingClassifier(BaseTransformer):
  self.set_drop_input_cols(drop_input_cols)
  self.set_sample_weight_col(sample_weight_col)

+ def _get_rand_id(self) -> str:
+ """
+ Generate random id to be used in sproc and stage names.
+
+ Returns:
+ Random id string usable in sproc, table, and stage names.
+ """
+ return str(uuid4()).replace("-", "_").upper()
+
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
  """
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -477,7 +487,7 @@ class GradientBoostingClassifier(BaseTransformer):
  cp.dump(self._sklearn_object, local_transform_file)

  # Create temp stage to run fit.
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
  SqlResultValidator(
  session=session,
@@ -490,11 +500,12 @@ class GradientBoostingClassifier(BaseTransformer):
  expected_value=f"Stage area {transform_stage_name} successfully created."
  ).validate()

- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ # Use posixpath to construct stage paths
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
  local_result_file_name = get_temp_file_path()
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -520,6 +531,7 @@ class GradientBoostingClassifier(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def fit_wrapper_sproc(
  session: Session,
@@ -528,7 +540,8 @@ class GradientBoostingClassifier(BaseTransformer):
  stage_result_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> str:
  import cloudpickle as cp
  import numpy as np
@@ -595,15 +608,15 @@ class GradientBoostingClassifier(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- sproc_export_file_name = session.call(
- fit_sproc_name,
+ sproc_export_file_name = fit_wrapper_sproc(
+ session,
  query,
  stage_transform_file_name,
  stage_result_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

@@ -613,7 +626,7 @@ class GradientBoostingClassifier(BaseTransformer):
  print("\n".join(fields[1:]))

  session.file.get(
- os.path.join(stage_result_file_name, sproc_export_file_name),
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
  local_result_file_name,
  statement_params=statement_params
  )
@@ -659,7 +672,7 @@ class GradientBoostingClassifier(BaseTransformer):

  # Register vectorized UDF for batch inference
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
- safe_id=self.id, method=inference_method)
+ safe_id=self._get_rand_id(), method=inference_method)

  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
  # will try to pickle all of self which fails.
@@ -751,7 +764,7 @@ class GradientBoostingClassifier(BaseTransformer):
  return transformed_pandas_df.to_dict("records")

  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
- safe_id=self.id
+ safe_id=self._get_rand_id()
  )

  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -807,26 +820,37 @@ class GradientBoostingClassifier(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
- missing_features = []
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
- missing_features.append(f)
-
- if len(missing_features) > 0:
- raise ValueError(
- "The feature names should match with those that were passed during fit.\n"
- f"Features seen during fit call but not present in the input: {missing_features}\n"
- f"Features in the input dataframe : {input_cols}\n"
- )
- input_df.columns = getattr(estimator, "feature_names_in_")
- else:
- # Just rename the column names to unquoted identifiers.
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+ if (
+ i >= len(input_cols)
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+ and quoted_input_cols[i] not in features_in_dataset)
+ ):
+ missing_features.append(f)
+ elif input_cols[i] in features_in_dataset:
+ columns_to_select.append(input_cols[i])
+ elif unquoted_input_cols[i] in features_in_dataset:
+ columns_to_select.append(unquoted_input_cols[i])
+ else:
+ columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+ raise ValueError(
+ "The feature names should match with those that were passed during fit.\n"
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
+ f"Features in the input dataframe : {input_cols}\n"
+ )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
  input_df
@@ -907,11 +931,18 @@ class GradientBoostingClassifier(BaseTransformer):
  Transformed dataset.
  """
  if isinstance(dataset, DataFrame):
+ expected_type_inferred = ""
+ # when it is classifier, infer the datatype from label columns
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
+ expected_type_inferred = convert_sp_to_sf_type(
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
+ )
+
  output_df = self._batch_inference(
  dataset=dataset,
  inference_method="predict",
  expected_output_cols_list=self.output_cols,
- expected_output_cols_type="",
+ expected_output_cols_type=expected_type_inferred,
  )
  elif isinstance(dataset, pd.DataFrame):
  output_df = self._sklearn_inference(
@@ -982,10 +1013,10 @@ class GradientBoostingClassifier(BaseTransformer):

  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
- Returns an empty list if current object is not a classifier or not yet fitted.
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
  """
  if getattr(self._sklearn_object, "classes_", None) is None:
- return []
+ return [output_cols_prefix]

  classes = self._sklearn_object.classes_
  if isinstance(classes, numpy.ndarray):
@@ -1216,7 +1247,7 @@ class GradientBoostingClassifier(BaseTransformer):
  cp.dump(self._sklearn_object, local_score_file)

  # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  session = dataset._session
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
  SqlResultValidator(
@@ -1230,8 +1261,9 @@ class GradientBoostingClassifier(BaseTransformer):
  expected_value=f"Stage area {score_stage_name} successfully created."
  ).validate()

- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ # Use posixpath to construct stage paths
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -1257,6 +1289,7 @@ class GradientBoostingClassifier(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def score_wrapper_sproc(
  session: Session,
@@ -1264,7 +1297,8 @@ class GradientBoostingClassifier(BaseTransformer):
  stage_score_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> float:
  import cloudpickle as cp
  import numpy as np
@@ -1314,14 +1348,14 @@ class GradientBoostingClassifier(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- score = session.call(
- score_sproc_name,
+ score = score_wrapper_sproc(
+ session,
  query,
  stage_score_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  cleanup_temp_files([local_score_file_name])
@@ -1339,18 +1373,20 @@ class GradientBoostingClassifier(BaseTransformer):
  if self._sklearn_object._estimator_type == 'classifier':
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  # For regressor, the type of predict is float64
  elif self._sklearn_object._estimator_type == 'regressor':
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  for prob_func in PROB_FUNCTIONS:
  if hasattr(self, prob_func):
  output_cols_prefix: str = f"{prob_func}_"
  output_column_names = self._get_output_column_names(output_cols_prefix)
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)

  @property
  def model_signatures(self) -> Dict[str, ModelSignature]:
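
In both files, the fit and score wrapper procedures are now registered with anonymous=True and invoked through the handle returned by registration instead of by name via session.call. A minimal sketch of that Snowpark pattern, with placeholder connection parameters and a hypothetical add_one procedure (an anonymous procedure creates no named object in the account, so the registration handle is the only way to invoke it):

from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

# Placeholder credentials; fill in real values to run.
session = Session.builder.configs({
    "account": "<account>", "user": "<user>", "password": "<password>",
}).create()

@sproc(replace=True, session=session, anonymous=True, packages=["snowflake-snowpark-python"])
def add_one(session_: Session, x: int) -> int:
    return x + 1

# Invoke the registration handle directly; there is no named procedure
# for session.call() to look up.
result = add_one(session, 41)  # 42
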