snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
--- a/snowflake/ml/modeling/ensemble/ada_boost_regressor.py
+++ b/snowflake/ml/modeling/ensemble/ada_boost_regressor.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -221,7 +223,6 @@ class AdaBoostRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimator)
         deps = deps | _gather_dependencies(base_estimator)
@@ -248,6 +249,15 @@ class AdaBoostRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
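
The `_get_rand_id()` helper above replaces the single `self.id` assigned once in `__init__`, so each stage, sproc, and table name gets a fresh suffix per call instead of reusing one id for the lifetime of the transformer. A minimal sketch of the same recipe (standard library only; the function name mirrors the diff):

```python
from uuid import uuid4

def _get_rand_id() -> str:
    # Hyphens become underscores and the result is upper-cased so the id is
    # safe to embed in an unquoted Snowflake identifier.
    return str(uuid4()).replace("-", "_").upper()

print(_get_rand_id())  # e.g. 0F8B2D6E_3C41_4A7B_9C2D_1E5F6A7B8C9D
```
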
@@ -326,7 +336,7 @@ class AdaBoostRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -339,11 +349,12 @@ class AdaBoostRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
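
The `os.path.join` to `posixpath.join` switch matters on Windows clients: stage paths are always '/'-separated, while `os.path.join` uses the platform separator. A small illustration (standard library only; the stage name is made up):

```python
import ntpath      # Windows path semantics, importable on any OS
import os
import posixpath

stage_name = "SNOWML_TRANSFORM_ABC123"   # hypothetical temp stage
file_name = "model.pkl.zip"

# On a Windows client, os.path.join behaves like ntpath.join:
print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl.zip  (broken stage path)
# posixpath.join yields the '/'-separated path a stage expects everywhere:
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl.zip
```
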
@@ -369,6 +380,7 @@ class AdaBoostRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -377,7 +389,8 @@ class AdaBoostRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -444,15 +457,15 @@ class AdaBoostRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -462,7 +475,7 @@ class AdaBoostRegressor(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -508,7 +521,7 @@ class AdaBoostRegressor(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -600,7 +613,7 @@ class AdaBoostRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -656,26 +669,37 @@ class AdaBoostRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
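
The rewritten block accepts a column under any of its three spellings (exactly as configured, unquoted, or quoted) and additionally checks that the column is actually present in the pandas dataset before selecting it, where the old code only compared names positionally. A self-contained toy run of the same selection logic (pandas only; all names invented):

```python
import pandas as pd

dataset = pd.DataFrame({'"A"': [1.0], "B": [2.0]})  # one quoted, one plain column
input_cols = ['"A"', "B"]           # as configured on the transformer
unquoted_input_cols = ["A", "B"]    # identifier.get_unescaped_names(...)
quoted_input_cols = ['"A"', '"B"']  # identifier.get_escaped_names(...)
features_required = ["A", "B"]      # what estimator.feature_names_in_ recorded at fit time

features_in_dataset = set(dataset.columns)
missing_features, columns_to_select = [], []
for i, f in enumerate(features_required):
    if (
        i >= len(input_cols)
        or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
        or (input_cols[i] not in features_in_dataset
            and unquoted_input_cols[i] not in features_in_dataset
            and quoted_input_cols[i] not in features_in_dataset)
    ):
        missing_features.append(f)
    elif input_cols[i] in features_in_dataset:
        columns_to_select.append(input_cols[i])
    elif unquoted_input_cols[i] in features_in_dataset:
        columns_to_select.append(unquoted_input_cols[i])
    else:
        columns_to_select.append(quoted_input_cols[i])

assert not missing_features
input_df = dataset[columns_to_select]
input_df.columns = features_required  # rename to the fit-time spelling
print(list(input_df.columns))         # ['A', 'B']
```
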
@@ -756,11 +780,18 @@ class AdaBoostRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -831,10 +862,10 @@ class AdaBoostRegressor(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1059,7 +1090,7 @@ class AdaBoostRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1073,8 +1104,9 @@ class AdaBoostRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1100,6 +1132,7 @@ class AdaBoostRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1107,7 +1140,8 @@ class AdaBoostRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1157,14 +1191,14 @@ class AdaBoostRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
        )
 
         cleanup_temp_files([local_score_file_name])
@@ -1182,18 +1216,20 @@ class AdaBoostRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
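
The signature edits above make the declared outputs of `predict` (and of the probability functions) include the input features whenever `drop_input_cols` is false, matching what the batch-inference UDF actually returns. A minimal sketch with the public signature classes from `snowflake.ml.model.model_signature` (feature names invented):

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="SEPAL_LENGTH")]
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_PRED")]

for drop_input_cols in (False, True):
    # Same expression as the diff: inputs are prepended unless dropped.
    sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
    print(drop_input_cols, [f.name for f in sig.outputs])
# False ['SEPAL_LENGTH', 'OUTPUT_PRED']
# True  ['OUTPUT_PRED']
```
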
--- a/snowflake/ml/modeling/ensemble/bagging_classifier.py
+++ b/snowflake/ml/modeling/ensemble/bagging_classifier.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -250,7 +252,6 @@ class BaggingClassifier(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimator)
         deps = deps | _gather_dependencies(base_estimator)
@@ -283,6 +284,15 @@ class BaggingClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -361,7 +371,7 @@ class BaggingClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -374,11 +384,12 @@ class BaggingClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -404,6 +415,7 @@ class BaggingClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -412,7 +424,8 @@ class BaggingClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -479,15 +492,15 @@ class BaggingClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -497,7 +510,7 @@ class BaggingClassifier(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -543,7 +556,7 @@ class BaggingClassifier(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -635,7 +648,7 @@ class BaggingClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -691,26 +704,37 @@ class BaggingClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -791,11 +815,18 @@ class BaggingClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
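
Here the classifier's `predict` output type is no longer the empty string: when a `predict` signature has been recorded, the first output's Snowpark type is converted to its Snowflake SQL name with the internal `convert_sp_to_sf_type` helper (the same import added at the top of the file). A hedged sketch of that conversion, assuming the internal helper keeps its current behavior:

```python
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
from snowflake.snowpark.types import DoubleType, LongType

# Internal API: maps a Snowpark DataType to the SQL type string used for the
# UDF's expected output column type (exact strings may differ by version).
print(convert_sp_to_sf_type(LongType()))    # e.g. 'BIGINT'
print(convert_sp_to_sf_type(DoubleType()))  # e.g. 'DOUBLE'
```
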
@@ -866,10 +897,10 @@ class BaggingClassifier(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
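
Returning `[output_cols_prefix]` rather than `[]` guarantees that the signature code always has at least one output column name to work with, even when `classes_` is absent. A toy rendition of the old-versus-new behavior (a free function standing in for the method; names illustrative):

```python
from typing import List, Optional, Sequence

def get_output_column_names(classes: Optional[Sequence], output_cols_prefix: str) -> List[str]:
    if classes is None:
        # New behavior: the bare prefix instead of an empty list.
        return [output_cols_prefix]
    return [f"{output_cols_prefix}{c}" for c in classes]

print(get_output_column_names(None, "predict_proba_"))    # ['predict_proba_']
print(get_output_column_names([0, 1], "predict_proba_"))  # ['predict_proba_0', 'predict_proba_1']
```
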
@@ -1100,7 +1131,7 @@ class BaggingClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1114,8 +1145,9 @@ class BaggingClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1141,6 +1173,7 @@ class BaggingClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1148,7 +1181,8 @@ class BaggingClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1198,14 +1232,14 @@ class BaggingClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1223,18 +1257,20 @@ class BaggingClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: