snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
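The diffs below reproduce the +79/−43 change that repeats across the autogenerated modeling estimators listed above, using snowflake/ml/modeling/ensemble/stacking_regressor.py and snowflake/ml/modeling/ensemble/voting_classifier.py as representative examples.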
snowflake/ml/modeling/ensemble/stacking_regressor.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -235,7 +237,6 @@ class StackingRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimators)
         deps = deps | _gather_dependencies(final_estimator)
@@ -262,6 +263,15 @@ class StackingRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -340,7 +350,7 @@ class StackingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -353,11 +363,12 @@ class StackingRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -383,6 +394,7 @@ class StackingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -391,7 +403,8 @@ class StackingRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -458,15 +471,15 @@ class StackingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -476,7 +489,7 @@ class StackingRegressor(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -522,7 +535,7 @@ class StackingRegressor(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -614,7 +627,7 @@ class StackingRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -670,26 +683,37 @@ class StackingRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -770,11 +794,18 @@ class StackingRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -847,10 +878,10 @@ class StackingRegressor(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1075,7 +1106,7 @@ class StackingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1089,8 +1120,9 @@ class StackingRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1116,6 +1148,7 @@ class StackingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1123,7 +1156,8 @@ class StackingRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1173,14 +1207,14 @@ class StackingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1198,18 +1232,20 @@ class StackingRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                    ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
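A recurring change above swaps os.path.join for posixpath.join when composing stage paths. A minimal standalone sketch of why (not from the package; ntpath is used only to simulate Windows path joining on any OS):

    import ntpath      # behaves like os.path does on Windows
    import posixpath   # always joins with forward slashes

    stage_name = "SNOWML_TRANSFORM_ABC123"
    file_name = "model.pkl"

    # On Windows, os.path.join produces a backslash-separated path, which is
    # not a valid stage path.
    print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl

    # posixpath.join yields the forward-slash form a stage path requires,
    # regardless of the client operating system.
    print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl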
snowflake/ml/modeling/ensemble/voting_classifier.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -219,7 +221,6 @@ class VotingClassifier(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimators)
         self._deps = list(deps)
@@ -244,6 +245,15 @@ class VotingClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -322,7 +332,7 @@ class VotingClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -335,11 +345,12 @@ class VotingClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -365,6 +376,7 @@ class VotingClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -373,7 +385,8 @@ class VotingClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -440,15 +453,15 @@ class VotingClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -458,7 +471,7 @@ class VotingClassifier(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -504,7 +517,7 @@ class VotingClassifier(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -596,7 +609,7 @@ class VotingClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -652,26 +665,37 @@ class VotingClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -752,11 +776,18 @@ class VotingClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -829,10 +860,10 @@ class VotingClassifier(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1061,7 +1092,7 @@ class VotingClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1075,8 +1106,9 @@ class VotingClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1102,6 +1134,7 @@ class VotingClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1109,7 +1142,8 @@ class VotingClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1159,14 +1193,14 @@ class VotingClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1184,18 +1218,20 @@ class VotingClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                    ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
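The other change repeated throughout is replacing the constructor-assigned self.id with a fresh _get_rand_id() at each call site, so every fit, batch-inference, and score invocation names its own temporary stage, sproc, and table. A standalone sketch of the helper (standard library only; the name template is copied from the diff):

    from uuid import uuid4

    def get_rand_id() -> str:
        # Hyphens are not legal in unquoted Snowflake identifiers, so replace
        # them with underscores; upper case matches unquoted-identifier folding.
        return str(uuid4()).replace("-", "_").upper()

    # Each call yields a distinct, identifier-safe suffix, e.g.
    # SNOWML_TRANSFORM_0F8E2D4C_9A1B_4C3D_8E5F_6A7B8C9D0E1F
    print("SNOWML_TRANSFORM_{safe_id}".format(safe_id=get_rand_id()))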