snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/feature_selection/select_percentile.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -28,6 +29,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -191,7 +193,6 @@ class SelectPercentile(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -212,6 +213,15 @@ class SelectPercentile(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
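
Across these autogenerated transformer classes, the per-instance self.id assigned once in __init__ is replaced by the _get_rand_id() helper shown above, so every temporary stage, sproc, and table created during fit, batch inference, and scoring gets a freshly generated name instead of reusing one id for the lifetime of the object. A minimal standalone sketch of the helper's behavior:

    from uuid import uuid4

    def _get_rand_id() -> str:
        # Hyphens are not valid in unquoted Snowflake identifiers, so the
        # UUID is underscored and upper-cased before use in object names.
        return str(uuid4()).replace("-", "_").upper()

    # Each call yields a distinct id, unlike the old per-instance self.id:
    print("SNOWML_TRANSFORM_{safe_id}".format(safe_id=_get_rand_id()))
    print("SNOWML_FIT_{safe_id}".format(safe_id=_get_rand_id()))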
@@ -290,7 +300,7 @@
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -303,11 +313,12 @@
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
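
The os.path.join to posixpath.join change fixes the separator used in stage paths: stage locations always use forward slashes, while os.path.join emits the host OS separator, which is a backslash on Windows clients. A small illustration that runs on any platform (stage and file names are made up):

    import ntpath      # the Windows flavor of os.path
    import posixpath   # the POSIX flavor of os.path

    stage_name = "SNOWML_TRANSFORM_ABC123"
    file_name = "model.pkl"

    # What a Windows client would have built with os.path.join:
    print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
    # The forward-slash form a stage expects, regardless of client OS:
    print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl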
@@ -333,6 +344,7 @@
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -341,7 +353,8 @@
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -408,15 +421,15 @@
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )


@@ -426,7 +439,7 @@
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -472,7 +485,7 @@

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -564,7 +577,7 @@
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -620,26 +633,37 @@
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
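
The rewritten block above matches each feature the estimator saw at fit time against three candidate spellings (the configured name, its unquoted form, and its quoted form) and selects whichever spelling actually exists in the dataset, rather than assuming the configured names are present verbatim. A self-contained sketch of that resolution logic, with hand-written lists standing in for identifier.get_unescaped_names / get_escaped_names:

    input_cols = ['"my col"', "COL2"]           # as configured on the transformer
    unquoted_input_cols = ["my col", "COL2"]    # unescaped forms
    quoted_input_cols = ['"my col"', '"COL2"']  # escaped forms
    features_in_dataset = {"my col", "COL2"}    # columns actually present
    features_required_by_estimator = ["my col", "COL2"]  # e.g. feature_names_in_

    missing_features, columns_to_select = [], []
    for i, f in enumerate(features_required_by_estimator):
        if i >= len(input_cols):
            missing_features.append(f)
            continue
        candidates = (input_cols[i], unquoted_input_cols[i], quoted_input_cols[i])
        if f not in candidates or not any(c in features_in_dataset for c in candidates):
            missing_features.append(f)
        else:
            # Prefer the configured spelling, then unquoted, then quoted.
            columns_to_select.append(next(c for c in candidates if c in features_in_dataset))

    if missing_features:
        raise ValueError(f"Features seen during fit but not present in the input: {missing_features}")
    print(columns_to_select)  # ['my col', 'COL2']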
@@ -718,11 +742,18 @@
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
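
The predict() change above derives the expected Snowflake column type from the stored 'predict' model signature instead of passing an empty string. The real code calls the internal helper snowflake.snowpark._internal.type_utils.convert_sp_to_sf_type; the sketch below substitutes a toy mapping and fake signature objects purely to show the data flow:

    # All names below are illustrative stand-ins, not the real API.
    SP_TO_SF = {"LongType": "BIGINT", "DoubleType": "DOUBLE", "StringType": "VARCHAR"}

    class FakeOutput:
        def as_snowpark_type(self) -> str:
            return "LongType"

    class FakeSignature:
        outputs = [FakeOutput()]

    model_signatures = {"predict": FakeSignature()}

    expected_type_inferred = ""
    if expected_type_inferred == "" and "predict" in model_signatures:
        expected_type_inferred = SP_TO_SF[
            model_signatures["predict"].outputs[0].as_snowpark_type()
        ]
    print(expected_type_inferred)  # BIGINT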
@@ -795,10 +826,10 @@

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
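
The _get_output_column_names fix matters for estimators without a classes_ attribute: returning [output_cols_prefix] instead of an empty list means functions like decision_function still produce one named output column. A trimmed sketch of the behavior as a free function (the per-class branch is a simplification of the surrounding code):

    from typing import List, Optional, Sequence

    def get_output_column_names(classes: Optional[Sequence], output_cols_prefix: str) -> List[str]:
        if classes is None:
            # New behavior: a single column named after the prefix,
            # instead of the old empty list.
            return [output_cols_prefix]
        return [f"{output_cols_prefix}{c}" for c in classes]

    print(get_output_column_names(None, "decision_function_"))  # ['decision_function_']
    print(get_output_column_names([0, 1], "predict_proba_"))    # ['predict_proba_0', 'predict_proba_1']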
@@ -1023,7 +1054,7 @@
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1037,8 +1068,9 @@
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1064,6 +1096,7 @@
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1071,7 +1104,8 @@
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1121,14 +1155,14 @@
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1146,18 +1180,20 @@
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
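
The final signature hunk makes the same adjustment in three places: the declared outputs of predict and of the probability functions now carry the input features through, unless drop_input_cols is set. With plain strings standing in for FeatureSpec objects, the rule reduces to:

    from typing import List

    def signature_outputs(inputs: List[str], outputs: List[str], drop_input_cols: bool) -> List[str]:
        # Inputs are prepended to the declared outputs unless dropped.
        return ([] if drop_input_cols else inputs) + outputs

    inputs = ["SEPAL_LENGTH", "SEPAL_WIDTH"]
    outputs = ["PREDICTION"]
    print(signature_outputs(inputs, outputs, drop_input_cols=False))
    # ['SEPAL_LENGTH', 'SEPAL_WIDTH', 'PREDICTION']
    print(signature_outputs(inputs, outputs, drop_input_cols=True))
    # ['PREDICTION']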
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -246,7 +248,6 @@ class SequentialFeatureSelector(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimator)
         self._deps = list(deps)
@@ -272,6 +273,15 @@ class SequentialFeatureSelector(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -350,7 +360,7 @@
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -363,11 +373,12 @@
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -393,6 +404,7 @@
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -401,7 +413,8 @@
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -468,15 +481,15 @@
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )


@@ -486,7 +499,7 @@
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -532,7 +545,7 @@

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -624,7 +637,7 @@
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -680,26 +693,37 @@
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -778,11 +802,18 @@
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -855,10 +886,10 @@

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1083,7 +1114,7 @@
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1097,8 +1128,9 @@
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1124,6 +1156,7 @@
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1131,7 +1164,8 @@
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1181,14 +1215,14 @@
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1206,18 +1240,20 @@
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: