snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/decomposition/incremental_pca.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -207,7 +209,6 @@ class IncrementalPCA(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -230,6 +231,15 @@ class IncrementalPCA(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -308,7 +318,7 @@ class IncrementalPCA(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -321,11 +331,12 @@ class IncrementalPCA(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -351,6 +362,7 @@ class IncrementalPCA(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -359,7 +371,8 @@ class IncrementalPCA(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -426,15 +439,15 @@ class IncrementalPCA(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

@@ -444,7 +457,7 @@ class IncrementalPCA(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -490,7 +503,7 @@ class IncrementalPCA(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -582,7 +595,7 @@ class IncrementalPCA(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -638,26 +651,37 @@ class IncrementalPCA(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
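
The rewritten column-matching block above accounts for most of the +79 −43 churn repeated across the autogenerated modeling files: instead of selecting dataset[input_cols] up front and only comparing quoted/unquoted spellings against feature_names_in_, the new code also checks which spelling is actually present in the DataFrame and selects that one. A minimal standalone sketch of the idea, with toy quote/unquote helpers standing in for the library's identifier utilities (hypothetical simplifications, not the real helpers):

import pandas as pd

# Toy stand-ins for identifier.get_unescaped_names / get_escaped_names;
# the real helpers implement full Snowflake identifier rules.
def unquote(name: str) -> str:
    return name[1:-1] if name.startswith('"') and name.endswith('"') else name

def quote(name: str) -> str:
    return f'"{name}"'

dataset = pd.DataFrame({"COL1": [1.0], '"col2"': [2.0]})  # mixed spellings
input_cols = ['"COL1"', '"col2"']   # as configured on the transformer
required = ["COL1", "col2"]         # what estimator.feature_names_in_ recorded at fit

present = set(dataset.columns)
columns_to_select, missing_features = [], []
for i, f in enumerate(required):
    spellings = (input_cols[i], unquote(input_cols[i]), quote(unquote(input_cols[i])))
    if f not in spellings or not any(s in present for s in spellings):
        missing_features.append(f)
    else:
        # pick whichever spelling the DataFrame actually uses
        columns_to_select.append(next(s for s in spellings if s in present))

if missing_features:
    raise ValueError(f"Features seen during fit but not present in the input: {missing_features}")

input_df = dataset[columns_to_select]
input_df.columns = required  # rename to the names the estimator saw during fit
print(input_df)

Either spelling of a column is now accepted in pandas input, as long as it lines up positionally with what the estimator saw during fit.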
@@ -736,11 +760,18 @@ class IncrementalPCA(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -813,10 +844,10 @@ class IncrementalPCA(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1041,7 +1072,7 @@ class IncrementalPCA(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1055,8 +1086,9 @@ class IncrementalPCA(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1082,6 +1114,7 @@ class IncrementalPCA(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1089,7 +1122,8 @@ class IncrementalPCA(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1139,14 +1173,14 @@ class IncrementalPCA(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1164,18 +1198,20 @@ class IncrementalPCA(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
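
That closes the incremental_pca.py diff. Two mechanical themes recur through its hunks and repeat across the other estimators: stage, table, and sproc names are now derived from a fresh _get_rand_id() at each call site instead of a per-instance self.id, and stage paths are built with posixpath.join rather than os.path.join. The path change matters because stage locations are always '/'-separated, while os.path follows the host OS convention. A minimal illustration (not from the package):

import ntpath      # what os.path resolves to on Windows
import posixpath   # what the new code uses unconditionally

stage_name = "SNOWML_TRANSFORM_ABC123"
file_name = "model.pkl"

# On Windows, os.path.join would produce a backslash-separated path,
# which is not a valid stage location for PUT/GET.
print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl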
snowflake/ml/modeling/decomposition/kernel_pca.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -291,7 +293,6 @@ class KernelPCA(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -326,6 +327,15 @@ class KernelPCA(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -404,7 +414,7 @@ class KernelPCA(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -417,11 +427,12 @@ class KernelPCA(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -447,6 +458,7 @@ class KernelPCA(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -455,7 +467,8 @@ class KernelPCA(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -522,15 +535,15 @@ class KernelPCA(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

@@ -540,7 +553,7 @@ class KernelPCA(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -586,7 +599,7 @@ class KernelPCA(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -678,7 +691,7 @@ class KernelPCA(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -734,26 +747,37 @@ class KernelPCA(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -832,11 +856,18 @@ class KernelPCA(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -909,10 +940,10 @@ class KernelPCA(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1137,7 +1168,7 @@ class KernelPCA(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1151,8 +1182,9 @@ class KernelPCA(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1178,6 +1210,7 @@ class KernelPCA(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1185,7 +1218,8 @@ class KernelPCA(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1235,14 +1269,14 @@ class KernelPCA(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
        )

         cleanup_temp_files([local_score_file_name])
@@ -1260,18 +1294,20 @@ class KernelPCA(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
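
The kernel_pca.py diff ends the same way, and the pattern repeats, modulo line offsets, in every other autogenerated estimator. One behavioral change worth spelling out: the fit and score stored procedures are now registered with anonymous=True and invoked by calling the returned handle directly, rather than registering a named procedure and dispatching through session.call(name, ...). A hedged sketch of that pattern, assuming an already-created Snowpark Session (add_one is a made-up example procedure, not from the package):

from typing import Any
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

def run_anonymous_sproc(session: Session) -> Any:
    # Register an anonymous sproc, as the diff now does: no named object
    # is created in the schema, so nothing is left behind afterwards.
    @sproc(replace=True, session=session, anonymous=True,
           packages=["snowflake-snowpark-python"])
    def add_one(session: Session, x: int) -> int:
        # Executes server-side; Snowflake supplies the Session argument.
        return x + 1

    # Direct call on the returned handle, mirroring
    # fit_wrapper_sproc(session, query, ...) in the hunks above,
    # instead of the old session.call("SNOWML_FIT_<id>", ...).
    return add_one(session, 41)

Passing the session as the first positional argument of the handle call follows the same convention the generated wrapper code uses.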