snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -203,7 +205,6 @@ class VotingRegressor(BaseTransformer):
203
205
  sample_weight_col: Optional[str] = None,
204
206
  ) -> None:
205
207
  super().__init__()
206
- self.id = str(uuid4()).replace("-", "_").upper()
207
208
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
208
209
  deps = deps | _gather_dependencies(estimators)
209
210
  self._deps = list(deps)
@@ -226,6 +227,15 @@ class VotingRegressor(BaseTransformer):
226
227
  self.set_drop_input_cols(drop_input_cols)
227
228
  self.set_sample_weight_col(sample_weight_col)
228
229
 
230
+ def _get_rand_id(self) -> str:
231
+ """
232
+ Generate random id to be used in sproc and stage names.
233
+
234
+ Returns:
235
+ Random id string usable in sproc, table, and stage names.
236
+ """
237
+ return str(uuid4()).replace("-", "_").upper()
238
+
229
239
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
230
240
  """
231
241
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -304,7 +314,7 @@ class VotingRegressor(BaseTransformer):
304
314
  cp.dump(self._sklearn_object, local_transform_file)
305
315
 
306
316
  # Create temp stage to run fit.
307
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
317
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
308
318
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
309
319
  SqlResultValidator(
310
320
  session=session,
@@ -317,11 +327,12 @@ class VotingRegressor(BaseTransformer):
317
327
  expected_value=f"Stage area {transform_stage_name} successfully created."
318
328
  ).validate()
319
329
 
320
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
330
+ # Use posixpath to construct stage paths
331
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
332
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
321
333
  local_result_file_name = get_temp_file_path()
322
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
323
334
 
324
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
335
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
325
336
  statement_params = telemetry.get_function_usage_statement_params(
326
337
  project=_PROJECT,
327
338
  subproject=_SUBPROJECT,
@@ -347,6 +358,7 @@ class VotingRegressor(BaseTransformer):
347
358
  replace=True,
348
359
  session=session,
349
360
  statement_params=statement_params,
361
+ anonymous=True
350
362
  )
351
363
  def fit_wrapper_sproc(
352
364
  session: Session,
@@ -355,7 +367,8 @@ class VotingRegressor(BaseTransformer):
355
367
  stage_result_file_name: str,
356
368
  input_cols: List[str],
357
369
  label_cols: List[str],
358
- sample_weight_col: Optional[str]
370
+ sample_weight_col: Optional[str],
371
+ statement_params: Dict[str, str]
359
372
  ) -> str:
360
373
  import cloudpickle as cp
361
374
  import numpy as np
@@ -422,15 +435,15 @@ class VotingRegressor(BaseTransformer):
422
435
  api_calls=[Session.call],
423
436
  custom_tags=dict([("autogen", True)]),
424
437
  )
425
- sproc_export_file_name = session.call(
426
- fit_sproc_name,
438
+ sproc_export_file_name = fit_wrapper_sproc(
439
+ session,
427
440
  query,
428
441
  stage_transform_file_name,
429
442
  stage_result_file_name,
430
443
  identifier.get_unescaped_names(self.input_cols),
431
444
  identifier.get_unescaped_names(self.label_cols),
432
445
  identifier.get_unescaped_names(self.sample_weight_col),
433
- statement_params=statement_params,
446
+ statement_params,
434
447
  )
435
448
 
436
449
  if "|" in sproc_export_file_name:
@@ -440,7 +453,7 @@ class VotingRegressor(BaseTransformer):
440
453
  print("\n".join(fields[1:]))
441
454
 
442
455
  session.file.get(
443
- os.path.join(stage_result_file_name, sproc_export_file_name),
456
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
444
457
  local_result_file_name,
445
458
  statement_params=statement_params
446
459
  )
@@ -486,7 +499,7 @@ class VotingRegressor(BaseTransformer):
486
499
 
487
500
  # Register vectorized UDF for batch inference
488
501
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
489
- safe_id=self.id, method=inference_method)
502
+ safe_id=self._get_rand_id(), method=inference_method)
490
503
 
491
504
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
492
505
  # will try to pickle all of self which fails.
@@ -578,7 +591,7 @@ class VotingRegressor(BaseTransformer):
578
591
  return transformed_pandas_df.to_dict("records")
579
592
 
580
593
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
581
- safe_id=self.id
594
+ safe_id=self._get_rand_id()
582
595
  )
583
596
 
584
597
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -634,26 +647,37 @@ class VotingRegressor(BaseTransformer):
634
647
  # input cols need to match unquoted / quoted
635
648
  input_cols = self.input_cols
636
649
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
650
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
637
651
 
638
652
  estimator = self._sklearn_object
639
653
 
640
- input_df = dataset[input_cols] # Select input columns with quoted column names.
641
- if hasattr(estimator, "feature_names_in_"):
642
- missing_features = []
643
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
644
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
645
- missing_features.append(f)
646
-
647
- if len(missing_features) > 0:
648
- raise ValueError(
649
- "The feature names should match with those that were passed during fit.\n"
650
- f"Features seen during fit call but not present in the input: {missing_features}\n"
651
- f"Features in the input dataframe : {input_cols}\n"
652
- )
653
- input_df.columns = getattr(estimator, "feature_names_in_")
654
- else:
655
- # Just rename the column names to unquoted identifiers.
656
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
654
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
655
+ missing_features = []
656
+ features_in_dataset = set(dataset.columns)
657
+ columns_to_select = []
658
+ for i, f in enumerate(features_required_by_estimator):
659
+ if (
660
+ i >= len(input_cols)
661
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
662
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
663
+ and quoted_input_cols[i] not in features_in_dataset)
664
+ ):
665
+ missing_features.append(f)
666
+ elif input_cols[i] in features_in_dataset:
667
+ columns_to_select.append(input_cols[i])
668
+ elif unquoted_input_cols[i] in features_in_dataset:
669
+ columns_to_select.append(unquoted_input_cols[i])
670
+ else:
671
+ columns_to_select.append(quoted_input_cols[i])
672
+
673
+ if len(missing_features) > 0:
674
+ raise ValueError(
675
+ "The feature names should match with those that were passed during fit.\n"
676
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
677
+ f"Features in the input dataframe : {input_cols}\n"
678
+ )
679
+ input_df = dataset[columns_to_select]
680
+ input_df.columns = features_required_by_estimator
657
681
 
658
682
  transformed_numpy_array = getattr(estimator, inference_method)(
659
683
  input_df
@@ -734,11 +758,18 @@ class VotingRegressor(BaseTransformer):
734
758
  Transformed dataset.
735
759
  """
736
760
  if isinstance(dataset, DataFrame):
761
+ expected_type_inferred = "float"
762
+ # when it is classifier, infer the datatype from label columns
763
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
764
+ expected_type_inferred = convert_sp_to_sf_type(
765
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
766
+ )
767
+
737
768
  output_df = self._batch_inference(
738
769
  dataset=dataset,
739
770
  inference_method="predict",
740
771
  expected_output_cols_list=self.output_cols,
741
- expected_output_cols_type="float",
772
+ expected_output_cols_type=expected_type_inferred,
742
773
  )
743
774
  elif isinstance(dataset, pd.DataFrame):
744
775
  output_df = self._sklearn_inference(
@@ -811,10 +842,10 @@ class VotingRegressor(BaseTransformer):
811
842
 
812
843
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
813
844
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
814
- Returns an empty list if current object is not a classifier or not yet fitted.
845
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
815
846
  """
816
847
  if getattr(self._sklearn_object, "classes_", None) is None:
817
- return []
848
+ return [output_cols_prefix]
818
849
 
819
850
  classes = self._sklearn_object.classes_
820
851
  if isinstance(classes, numpy.ndarray):
@@ -1039,7 +1070,7 @@ class VotingRegressor(BaseTransformer):
1039
1070
  cp.dump(self._sklearn_object, local_score_file)
1040
1071
 
1041
1072
  # Create temp stage to run score.
1042
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1073
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1043
1074
  session = dataset._session
1044
1075
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1045
1076
  SqlResultValidator(
@@ -1053,8 +1084,9 @@ class VotingRegressor(BaseTransformer):
1053
1084
  expected_value=f"Stage area {score_stage_name} successfully created."
1054
1085
  ).validate()
1055
1086
 
1056
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1057
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1087
+ # Use posixpath to construct stage paths
1088
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1089
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1058
1090
  statement_params = telemetry.get_function_usage_statement_params(
1059
1091
  project=_PROJECT,
1060
1092
  subproject=_SUBPROJECT,
@@ -1080,6 +1112,7 @@ class VotingRegressor(BaseTransformer):
1080
1112
  replace=True,
1081
1113
  session=session,
1082
1114
  statement_params=statement_params,
1115
+ anonymous=True
1083
1116
  )
1084
1117
  def score_wrapper_sproc(
1085
1118
  session: Session,
@@ -1087,7 +1120,8 @@ class VotingRegressor(BaseTransformer):
1087
1120
  stage_score_file_name: str,
1088
1121
  input_cols: List[str],
1089
1122
  label_cols: List[str],
1090
- sample_weight_col: Optional[str]
1123
+ sample_weight_col: Optional[str],
1124
+ statement_params: Dict[str, str]
1091
1125
  ) -> float:
1092
1126
  import cloudpickle as cp
1093
1127
  import numpy as np
@@ -1137,14 +1171,14 @@ class VotingRegressor(BaseTransformer):
1137
1171
  api_calls=[Session.call],
1138
1172
  custom_tags=dict([("autogen", True)]),
1139
1173
  )
1140
- score = session.call(
1141
- score_sproc_name,
1174
+ score = score_wrapper_sproc(
1175
+ session,
1142
1176
  query,
1143
1177
  stage_score_file_name,
1144
1178
  identifier.get_unescaped_names(self.input_cols),
1145
1179
  identifier.get_unescaped_names(self.label_cols),
1146
1180
  identifier.get_unescaped_names(self.sample_weight_col),
1147
- statement_params=statement_params,
1181
+ statement_params,
1148
1182
  )
1149
1183
 
1150
1184
  cleanup_temp_files([local_score_file_name])
@@ -1162,18 +1196,20 @@ class VotingRegressor(BaseTransformer):
1162
1196
  if self._sklearn_object._estimator_type == 'classifier':
1163
1197
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1164
1198
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1165
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1199
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1200
+ ([] if self._drop_input_cols else inputs) + outputs)
1166
1201
  # For regressor, the type of predict is float64
1167
1202
  elif self._sklearn_object._estimator_type == 'regressor':
1168
1203
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1169
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1170
-
1204
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1205
+ ([] if self._drop_input_cols else inputs) + outputs)
1171
1206
  for prob_func in PROB_FUNCTIONS:
1172
1207
  if hasattr(self, prob_func):
1173
1208
  output_cols_prefix: str = f"{prob_func}_"
1174
1209
  output_column_names = self._get_output_column_names(output_cols_prefix)
1175
1210
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1176
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1211
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1212
+ ([] if self._drop_input_cols else inputs) + outputs)
1177
1213
 
1178
1214
  @property
1179
1215
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -28,6 +29,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
28
29
  from snowflake.snowpark import DataFrame, Session
29
30
  from snowflake.snowpark.functions import pandas_udf, sproc
30
31
  from snowflake.snowpark.types import PandasSeries
32
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
31
33
 
32
34
  from snowflake.ml.model.model_signature import (
33
35
  DataType,
@@ -194,7 +196,6 @@ class GenericUnivariateSelect(BaseTransformer):
194
196
  sample_weight_col: Optional[str] = None,
195
197
  ) -> None:
196
198
  super().__init__()
197
- self.id = str(uuid4()).replace("-", "_").upper()
198
199
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
199
200
 
200
201
  self._deps = list(deps)
@@ -216,6 +217,15 @@ class GenericUnivariateSelect(BaseTransformer):
216
217
  self.set_drop_input_cols(drop_input_cols)
217
218
  self.set_sample_weight_col(sample_weight_col)
218
219
 
220
+ def _get_rand_id(self) -> str:
221
+ """
222
+ Generate random id to be used in sproc and stage names.
223
+
224
+ Returns:
225
+ Random id string usable in sproc, table, and stage names.
226
+ """
227
+ return str(uuid4()).replace("-", "_").upper()
228
+
219
229
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
220
230
  """
221
231
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -294,7 +304,7 @@ class GenericUnivariateSelect(BaseTransformer):
294
304
  cp.dump(self._sklearn_object, local_transform_file)
295
305
 
296
306
  # Create temp stage to run fit.
297
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
307
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
298
308
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
299
309
  SqlResultValidator(
300
310
  session=session,
@@ -307,11 +317,12 @@ class GenericUnivariateSelect(BaseTransformer):
307
317
  expected_value=f"Stage area {transform_stage_name} successfully created."
308
318
  ).validate()
309
319
 
310
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
320
+ # Use posixpath to construct stage paths
321
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
322
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
311
323
  local_result_file_name = get_temp_file_path()
312
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
313
324
 
314
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
325
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
315
326
  statement_params = telemetry.get_function_usage_statement_params(
316
327
  project=_PROJECT,
317
328
  subproject=_SUBPROJECT,
@@ -337,6 +348,7 @@ class GenericUnivariateSelect(BaseTransformer):
337
348
  replace=True,
338
349
  session=session,
339
350
  statement_params=statement_params,
351
+ anonymous=True
340
352
  )
341
353
  def fit_wrapper_sproc(
342
354
  session: Session,
@@ -345,7 +357,8 @@ class GenericUnivariateSelect(BaseTransformer):
345
357
  stage_result_file_name: str,
346
358
  input_cols: List[str],
347
359
  label_cols: List[str],
348
- sample_weight_col: Optional[str]
360
+ sample_weight_col: Optional[str],
361
+ statement_params: Dict[str, str]
349
362
  ) -> str:
350
363
  import cloudpickle as cp
351
364
  import numpy as np
@@ -412,15 +425,15 @@ class GenericUnivariateSelect(BaseTransformer):
412
425
  api_calls=[Session.call],
413
426
  custom_tags=dict([("autogen", True)]),
414
427
  )
415
- sproc_export_file_name = session.call(
416
- fit_sproc_name,
428
+ sproc_export_file_name = fit_wrapper_sproc(
429
+ session,
417
430
  query,
418
431
  stage_transform_file_name,
419
432
  stage_result_file_name,
420
433
  identifier.get_unescaped_names(self.input_cols),
421
434
  identifier.get_unescaped_names(self.label_cols),
422
435
  identifier.get_unescaped_names(self.sample_weight_col),
423
- statement_params=statement_params,
436
+ statement_params,
424
437
  )
425
438
 
426
439
  if "|" in sproc_export_file_name:
@@ -430,7 +443,7 @@ class GenericUnivariateSelect(BaseTransformer):
430
443
  print("\n".join(fields[1:]))
431
444
 
432
445
  session.file.get(
433
- os.path.join(stage_result_file_name, sproc_export_file_name),
446
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
434
447
  local_result_file_name,
435
448
  statement_params=statement_params
436
449
  )
@@ -476,7 +489,7 @@ class GenericUnivariateSelect(BaseTransformer):
476
489
 
477
490
  # Register vectorized UDF for batch inference
478
491
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
479
- safe_id=self.id, method=inference_method)
492
+ safe_id=self._get_rand_id(), method=inference_method)
480
493
 
481
494
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
482
495
  # will try to pickle all of self which fails.
@@ -568,7 +581,7 @@ class GenericUnivariateSelect(BaseTransformer):
568
581
  return transformed_pandas_df.to_dict("records")
569
582
 
570
583
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
571
- safe_id=self.id
584
+ safe_id=self._get_rand_id()
572
585
  )
573
586
 
574
587
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -624,26 +637,37 @@ class GenericUnivariateSelect(BaseTransformer):
624
637
  # input cols need to match unquoted / quoted
625
638
  input_cols = self.input_cols
626
639
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
640
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
627
641
 
628
642
  estimator = self._sklearn_object
629
643
 
630
- input_df = dataset[input_cols] # Select input columns with quoted column names.
631
- if hasattr(estimator, "feature_names_in_"):
632
- missing_features = []
633
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
634
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
635
- missing_features.append(f)
636
-
637
- if len(missing_features) > 0:
638
- raise ValueError(
639
- "The feature names should match with those that were passed during fit.\n"
640
- f"Features seen during fit call but not present in the input: {missing_features}\n"
641
- f"Features in the input dataframe : {input_cols}\n"
642
- )
643
- input_df.columns = getattr(estimator, "feature_names_in_")
644
- else:
645
- # Just rename the column names to unquoted identifiers.
646
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
644
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
645
+ missing_features = []
646
+ features_in_dataset = set(dataset.columns)
647
+ columns_to_select = []
648
+ for i, f in enumerate(features_required_by_estimator):
649
+ if (
650
+ i >= len(input_cols)
651
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
652
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
653
+ and quoted_input_cols[i] not in features_in_dataset)
654
+ ):
655
+ missing_features.append(f)
656
+ elif input_cols[i] in features_in_dataset:
657
+ columns_to_select.append(input_cols[i])
658
+ elif unquoted_input_cols[i] in features_in_dataset:
659
+ columns_to_select.append(unquoted_input_cols[i])
660
+ else:
661
+ columns_to_select.append(quoted_input_cols[i])
662
+
663
+ if len(missing_features) > 0:
664
+ raise ValueError(
665
+ "The feature names should match with those that were passed during fit.\n"
666
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
667
+ f"Features in the input dataframe : {input_cols}\n"
668
+ )
669
+ input_df = dataset[columns_to_select]
670
+ input_df.columns = features_required_by_estimator
647
671
 
648
672
  transformed_numpy_array = getattr(estimator, inference_method)(
649
673
  input_df
@@ -722,11 +746,18 @@ class GenericUnivariateSelect(BaseTransformer):
722
746
  Transformed dataset.
723
747
  """
724
748
  if isinstance(dataset, DataFrame):
749
+ expected_type_inferred = ""
750
+ # when it is classifier, infer the datatype from label columns
751
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
752
+ expected_type_inferred = convert_sp_to_sf_type(
753
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
754
+ )
755
+
725
756
  output_df = self._batch_inference(
726
757
  dataset=dataset,
727
758
  inference_method="predict",
728
759
  expected_output_cols_list=self.output_cols,
729
- expected_output_cols_type="",
760
+ expected_output_cols_type=expected_type_inferred,
730
761
  )
731
762
  elif isinstance(dataset, pd.DataFrame):
732
763
  output_df = self._sklearn_inference(
@@ -799,10 +830,10 @@ class GenericUnivariateSelect(BaseTransformer):
799
830
 
800
831
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
801
832
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
802
- Returns an empty list if current object is not a classifier or not yet fitted.
833
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
803
834
  """
804
835
  if getattr(self._sklearn_object, "classes_", None) is None:
805
- return []
836
+ return [output_cols_prefix]
806
837
 
807
838
  classes = self._sklearn_object.classes_
808
839
  if isinstance(classes, numpy.ndarray):
@@ -1027,7 +1058,7 @@ class GenericUnivariateSelect(BaseTransformer):
1027
1058
  cp.dump(self._sklearn_object, local_score_file)
1028
1059
 
1029
1060
  # Create temp stage to run score.
1030
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1061
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1031
1062
  session = dataset._session
1032
1063
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1033
1064
  SqlResultValidator(
@@ -1041,8 +1072,9 @@ class GenericUnivariateSelect(BaseTransformer):
1041
1072
  expected_value=f"Stage area {score_stage_name} successfully created."
1042
1073
  ).validate()
1043
1074
 
1044
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1045
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1075
+ # Use posixpath to construct stage paths
1076
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1077
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1046
1078
  statement_params = telemetry.get_function_usage_statement_params(
1047
1079
  project=_PROJECT,
1048
1080
  subproject=_SUBPROJECT,
@@ -1068,6 +1100,7 @@ class GenericUnivariateSelect(BaseTransformer):
1068
1100
  replace=True,
1069
1101
  session=session,
1070
1102
  statement_params=statement_params,
1103
+ anonymous=True
1071
1104
  )
1072
1105
  def score_wrapper_sproc(
1073
1106
  session: Session,
@@ -1075,7 +1108,8 @@ class GenericUnivariateSelect(BaseTransformer):
1075
1108
  stage_score_file_name: str,
1076
1109
  input_cols: List[str],
1077
1110
  label_cols: List[str],
1078
- sample_weight_col: Optional[str]
1111
+ sample_weight_col: Optional[str],
1112
+ statement_params: Dict[str, str]
1079
1113
  ) -> float:
1080
1114
  import cloudpickle as cp
1081
1115
  import numpy as np
@@ -1125,14 +1159,14 @@ class GenericUnivariateSelect(BaseTransformer):
1125
1159
  api_calls=[Session.call],
1126
1160
  custom_tags=dict([("autogen", True)]),
1127
1161
  )
1128
- score = session.call(
1129
- score_sproc_name,
1162
+ score = score_wrapper_sproc(
1163
+ session,
1130
1164
  query,
1131
1165
  stage_score_file_name,
1132
1166
  identifier.get_unescaped_names(self.input_cols),
1133
1167
  identifier.get_unescaped_names(self.label_cols),
1134
1168
  identifier.get_unescaped_names(self.sample_weight_col),
1135
- statement_params=statement_params,
1169
+ statement_params,
1136
1170
  )
1137
1171
 
1138
1172
  cleanup_temp_files([local_score_file_name])
@@ -1150,18 +1184,20 @@ class GenericUnivariateSelect(BaseTransformer):
1150
1184
  if self._sklearn_object._estimator_type == 'classifier':
1151
1185
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1152
1186
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1153
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1187
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1188
+ ([] if self._drop_input_cols else inputs) + outputs)
1154
1189
  # For regressor, the type of predict is float64
1155
1190
  elif self._sklearn_object._estimator_type == 'regressor':
1156
1191
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1157
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1158
-
1192
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1193
+ ([] if self._drop_input_cols else inputs) + outputs)
1159
1194
  for prob_func in PROB_FUNCTIONS:
1160
1195
  if hasattr(self, prob_func):
1161
1196
  output_cols_prefix: str = f"{prob_func}_"
1162
1197
  output_column_names = self._get_output_column_names(output_cols_prefix)
1163
1198
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1164
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1199
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1200
+ ([] if self._drop_input_cols else inputs) + outputs)
1165
1201
 
1166
1202
  @property
1167
1203
  def model_signatures(self) -> Dict[str, ModelSignature]: