snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -253,7 +255,6 @@ class KNeighborsClassifier(BaseTransformer):
253
255
  sample_weight_col: Optional[str] = None,
254
256
  ) -> None:
255
257
  super().__init__()
256
- self.id = str(uuid4()).replace("-", "_").upper()
257
258
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
258
259
 
259
260
  self._deps = list(deps)
@@ -280,6 +281,15 @@ class KNeighborsClassifier(BaseTransformer):
280
281
  self.set_drop_input_cols(drop_input_cols)
281
282
  self.set_sample_weight_col(sample_weight_col)
282
283
 
284
+ def _get_rand_id(self) -> str:
285
+ """
286
+ Generate random id to be used in sproc and stage names.
287
+
288
+ Returns:
289
+ Random id string usable in sproc, table, and stage names.
290
+ """
291
+ return str(uuid4()).replace("-", "_").upper()
292
+
283
293
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
284
294
  """
285
295
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -358,7 +368,7 @@ class KNeighborsClassifier(BaseTransformer):
358
368
  cp.dump(self._sklearn_object, local_transform_file)
359
369
 
360
370
  # Create temp stage to run fit.
361
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
371
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
362
372
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
363
373
  SqlResultValidator(
364
374
  session=session,
@@ -371,11 +381,12 @@ class KNeighborsClassifier(BaseTransformer):
371
381
  expected_value=f"Stage area {transform_stage_name} successfully created."
372
382
  ).validate()
373
383
 
374
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
384
+ # Use posixpath to construct stage paths
385
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
386
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
375
387
  local_result_file_name = get_temp_file_path()
376
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
377
388
 
378
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
389
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
379
390
  statement_params = telemetry.get_function_usage_statement_params(
380
391
  project=_PROJECT,
381
392
  subproject=_SUBPROJECT,
@@ -401,6 +412,7 @@ class KNeighborsClassifier(BaseTransformer):
401
412
  replace=True,
402
413
  session=session,
403
414
  statement_params=statement_params,
415
+ anonymous=True
404
416
  )
405
417
  def fit_wrapper_sproc(
406
418
  session: Session,
@@ -409,7 +421,8 @@ class KNeighborsClassifier(BaseTransformer):
409
421
  stage_result_file_name: str,
410
422
  input_cols: List[str],
411
423
  label_cols: List[str],
412
- sample_weight_col: Optional[str]
424
+ sample_weight_col: Optional[str],
425
+ statement_params: Dict[str, str]
413
426
  ) -> str:
414
427
  import cloudpickle as cp
415
428
  import numpy as np
@@ -476,15 +489,15 @@ class KNeighborsClassifier(BaseTransformer):
476
489
  api_calls=[Session.call],
477
490
  custom_tags=dict([("autogen", True)]),
478
491
  )
479
- sproc_export_file_name = session.call(
480
- fit_sproc_name,
492
+ sproc_export_file_name = fit_wrapper_sproc(
493
+ session,
481
494
  query,
482
495
  stage_transform_file_name,
483
496
  stage_result_file_name,
484
497
  identifier.get_unescaped_names(self.input_cols),
485
498
  identifier.get_unescaped_names(self.label_cols),
486
499
  identifier.get_unescaped_names(self.sample_weight_col),
487
- statement_params=statement_params,
500
+ statement_params,
488
501
  )
489
502
 
490
503
  if "|" in sproc_export_file_name:
@@ -494,7 +507,7 @@ class KNeighborsClassifier(BaseTransformer):
494
507
  print("\n".join(fields[1:]))
495
508
 
496
509
  session.file.get(
497
- os.path.join(stage_result_file_name, sproc_export_file_name),
510
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
498
511
  local_result_file_name,
499
512
  statement_params=statement_params
500
513
  )
@@ -540,7 +553,7 @@ class KNeighborsClassifier(BaseTransformer):
540
553
 
541
554
  # Register vectorized UDF for batch inference
542
555
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
543
- safe_id=self.id, method=inference_method)
556
+ safe_id=self._get_rand_id(), method=inference_method)
544
557
 
545
558
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
546
559
  # will try to pickle all of self which fails.
@@ -632,7 +645,7 @@ class KNeighborsClassifier(BaseTransformer):
632
645
  return transformed_pandas_df.to_dict("records")
633
646
 
634
647
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
635
- safe_id=self.id
648
+ safe_id=self._get_rand_id()
636
649
  )
637
650
 
638
651
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -688,26 +701,37 @@ class KNeighborsClassifier(BaseTransformer):
688
701
  # input cols need to match unquoted / quoted
689
702
  input_cols = self.input_cols
690
703
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
704
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
691
705
 
692
706
  estimator = self._sklearn_object
693
707
 
694
- input_df = dataset[input_cols] # Select input columns with quoted column names.
695
- if hasattr(estimator, "feature_names_in_"):
696
- missing_features = []
697
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
698
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
699
- missing_features.append(f)
700
-
701
- if len(missing_features) > 0:
702
- raise ValueError(
703
- "The feature names should match with those that were passed during fit.\n"
704
- f"Features seen during fit call but not present in the input: {missing_features}\n"
705
- f"Features in the input dataframe : {input_cols}\n"
706
- )
707
- input_df.columns = getattr(estimator, "feature_names_in_")
708
- else:
709
- # Just rename the column names to unquoted identifiers.
710
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
708
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
709
+ missing_features = []
710
+ features_in_dataset = set(dataset.columns)
711
+ columns_to_select = []
712
+ for i, f in enumerate(features_required_by_estimator):
713
+ if (
714
+ i >= len(input_cols)
715
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
716
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
717
+ and quoted_input_cols[i] not in features_in_dataset)
718
+ ):
719
+ missing_features.append(f)
720
+ elif input_cols[i] in features_in_dataset:
721
+ columns_to_select.append(input_cols[i])
722
+ elif unquoted_input_cols[i] in features_in_dataset:
723
+ columns_to_select.append(unquoted_input_cols[i])
724
+ else:
725
+ columns_to_select.append(quoted_input_cols[i])
726
+
727
+ if len(missing_features) > 0:
728
+ raise ValueError(
729
+ "The feature names should match with those that were passed during fit.\n"
730
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
731
+ f"Features in the input dataframe : {input_cols}\n"
732
+ )
733
+ input_df = dataset[columns_to_select]
734
+ input_df.columns = features_required_by_estimator
711
735
 
712
736
  transformed_numpy_array = getattr(estimator, inference_method)(
713
737
  input_df
@@ -788,11 +812,18 @@ class KNeighborsClassifier(BaseTransformer):
788
812
  Transformed dataset.
789
813
  """
790
814
  if isinstance(dataset, DataFrame):
815
+ expected_type_inferred = ""
816
+ # when it is classifier, infer the datatype from label columns
817
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
818
+ expected_type_inferred = convert_sp_to_sf_type(
819
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
820
+ )
821
+
791
822
  output_df = self._batch_inference(
792
823
  dataset=dataset,
793
824
  inference_method="predict",
794
825
  expected_output_cols_list=self.output_cols,
795
- expected_output_cols_type="",
826
+ expected_output_cols_type=expected_type_inferred,
796
827
  )
797
828
  elif isinstance(dataset, pd.DataFrame):
798
829
  output_df = self._sklearn_inference(
@@ -863,10 +894,10 @@ class KNeighborsClassifier(BaseTransformer):
863
894
 
864
895
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
865
896
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
866
- Returns an empty list if current object is not a classifier or not yet fitted.
897
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
867
898
  """
868
899
  if getattr(self._sklearn_object, "classes_", None) is None:
869
- return []
900
+ return [output_cols_prefix]
870
901
 
871
902
  classes = self._sklearn_object.classes_
872
903
  if isinstance(classes, numpy.ndarray):
@@ -1095,7 +1126,7 @@ class KNeighborsClassifier(BaseTransformer):
1095
1126
  cp.dump(self._sklearn_object, local_score_file)
1096
1127
 
1097
1128
  # Create temp stage to run score.
1098
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1129
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1099
1130
  session = dataset._session
1100
1131
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1101
1132
  SqlResultValidator(
@@ -1109,8 +1140,9 @@ class KNeighborsClassifier(BaseTransformer):
1109
1140
  expected_value=f"Stage area {score_stage_name} successfully created."
1110
1141
  ).validate()
1111
1142
 
1112
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1113
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1143
+ # Use posixpath to construct stage paths
1144
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1145
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1114
1146
  statement_params = telemetry.get_function_usage_statement_params(
1115
1147
  project=_PROJECT,
1116
1148
  subproject=_SUBPROJECT,
@@ -1136,6 +1168,7 @@ class KNeighborsClassifier(BaseTransformer):
1136
1168
  replace=True,
1137
1169
  session=session,
1138
1170
  statement_params=statement_params,
1171
+ anonymous=True
1139
1172
  )
1140
1173
  def score_wrapper_sproc(
1141
1174
  session: Session,
@@ -1143,7 +1176,8 @@ class KNeighborsClassifier(BaseTransformer):
1143
1176
  stage_score_file_name: str,
1144
1177
  input_cols: List[str],
1145
1178
  label_cols: List[str],
1146
- sample_weight_col: Optional[str]
1179
+ sample_weight_col: Optional[str],
1180
+ statement_params: Dict[str, str]
1147
1181
  ) -> float:
1148
1182
  import cloudpickle as cp
1149
1183
  import numpy as np
@@ -1193,14 +1227,14 @@ class KNeighborsClassifier(BaseTransformer):
1193
1227
  api_calls=[Session.call],
1194
1228
  custom_tags=dict([("autogen", True)]),
1195
1229
  )
1196
- score = session.call(
1197
- score_sproc_name,
1230
+ score = score_wrapper_sproc(
1231
+ session,
1198
1232
  query,
1199
1233
  stage_score_file_name,
1200
1234
  identifier.get_unescaped_names(self.input_cols),
1201
1235
  identifier.get_unescaped_names(self.label_cols),
1202
1236
  identifier.get_unescaped_names(self.sample_weight_col),
1203
- statement_params=statement_params,
1237
+ statement_params,
1204
1238
  )
1205
1239
 
1206
1240
  cleanup_temp_files([local_score_file_name])
@@ -1218,18 +1252,20 @@ class KNeighborsClassifier(BaseTransformer):
1218
1252
  if self._sklearn_object._estimator_type == 'classifier':
1219
1253
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1220
1254
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1221
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1255
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1256
+ ([] if self._drop_input_cols else inputs) + outputs)
1222
1257
  # For regressor, the type of predict is float64
1223
1258
  elif self._sklearn_object._estimator_type == 'regressor':
1224
1259
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1225
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1226
-
1260
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1261
+ ([] if self._drop_input_cols else inputs) + outputs)
1227
1262
  for prob_func in PROB_FUNCTIONS:
1228
1263
  if hasattr(self, prob_func):
1229
1264
  output_cols_prefix: str = f"{prob_func}_"
1230
1265
  output_column_names = self._get_output_column_names(output_cols_prefix)
1231
1266
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1232
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1267
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1268
+ ([] if self._drop_input_cols else inputs) + outputs)
1233
1269
 
1234
1270
  @property
1235
1271
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -255,7 +257,6 @@ class KNeighborsRegressor(BaseTransformer):
255
257
  sample_weight_col: Optional[str] = None,
256
258
  ) -> None:
257
259
  super().__init__()
258
- self.id = str(uuid4()).replace("-", "_").upper()
259
260
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
260
261
 
261
262
  self._deps = list(deps)
@@ -282,6 +283,15 @@ class KNeighborsRegressor(BaseTransformer):
282
283
  self.set_drop_input_cols(drop_input_cols)
283
284
  self.set_sample_weight_col(sample_weight_col)
284
285
 
286
+ def _get_rand_id(self) -> str:
287
+ """
288
+ Generate random id to be used in sproc and stage names.
289
+
290
+ Returns:
291
+ Random id string usable in sproc, table, and stage names.
292
+ """
293
+ return str(uuid4()).replace("-", "_").upper()
294
+
285
295
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
286
296
  """
287
297
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -360,7 +370,7 @@ class KNeighborsRegressor(BaseTransformer):
360
370
  cp.dump(self._sklearn_object, local_transform_file)
361
371
 
362
372
  # Create temp stage to run fit.
363
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
373
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
364
374
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
365
375
  SqlResultValidator(
366
376
  session=session,
@@ -373,11 +383,12 @@ class KNeighborsRegressor(BaseTransformer):
373
383
  expected_value=f"Stage area {transform_stage_name} successfully created."
374
384
  ).validate()
375
385
 
376
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
386
+ # Use posixpath to construct stage paths
387
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
388
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
377
389
  local_result_file_name = get_temp_file_path()
378
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
379
390
 
380
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
391
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
381
392
  statement_params = telemetry.get_function_usage_statement_params(
382
393
  project=_PROJECT,
383
394
  subproject=_SUBPROJECT,
@@ -403,6 +414,7 @@ class KNeighborsRegressor(BaseTransformer):
403
414
  replace=True,
404
415
  session=session,
405
416
  statement_params=statement_params,
417
+ anonymous=True
406
418
  )
407
419
  def fit_wrapper_sproc(
408
420
  session: Session,
@@ -411,7 +423,8 @@ class KNeighborsRegressor(BaseTransformer):
411
423
  stage_result_file_name: str,
412
424
  input_cols: List[str],
413
425
  label_cols: List[str],
414
- sample_weight_col: Optional[str]
426
+ sample_weight_col: Optional[str],
427
+ statement_params: Dict[str, str]
415
428
  ) -> str:
416
429
  import cloudpickle as cp
417
430
  import numpy as np
@@ -478,15 +491,15 @@ class KNeighborsRegressor(BaseTransformer):
478
491
  api_calls=[Session.call],
479
492
  custom_tags=dict([("autogen", True)]),
480
493
  )
481
- sproc_export_file_name = session.call(
482
- fit_sproc_name,
494
+ sproc_export_file_name = fit_wrapper_sproc(
495
+ session,
483
496
  query,
484
497
  stage_transform_file_name,
485
498
  stage_result_file_name,
486
499
  identifier.get_unescaped_names(self.input_cols),
487
500
  identifier.get_unescaped_names(self.label_cols),
488
501
  identifier.get_unescaped_names(self.sample_weight_col),
489
- statement_params=statement_params,
502
+ statement_params,
490
503
  )
491
504
 
492
505
  if "|" in sproc_export_file_name:
@@ -496,7 +509,7 @@ class KNeighborsRegressor(BaseTransformer):
496
509
  print("\n".join(fields[1:]))
497
510
 
498
511
  session.file.get(
499
- os.path.join(stage_result_file_name, sproc_export_file_name),
512
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
500
513
  local_result_file_name,
501
514
  statement_params=statement_params
502
515
  )
@@ -542,7 +555,7 @@ class KNeighborsRegressor(BaseTransformer):
542
555
 
543
556
  # Register vectorized UDF for batch inference
544
557
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
545
- safe_id=self.id, method=inference_method)
558
+ safe_id=self._get_rand_id(), method=inference_method)
546
559
 
547
560
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
548
561
  # will try to pickle all of self which fails.
@@ -634,7 +647,7 @@ class KNeighborsRegressor(BaseTransformer):
634
647
  return transformed_pandas_df.to_dict("records")
635
648
 
636
649
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
637
- safe_id=self.id
650
+ safe_id=self._get_rand_id()
638
651
  )
639
652
 
640
653
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -690,26 +703,37 @@ class KNeighborsRegressor(BaseTransformer):
690
703
  # input cols need to match unquoted / quoted
691
704
  input_cols = self.input_cols
692
705
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
706
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
693
707
 
694
708
  estimator = self._sklearn_object
695
709
 
696
- input_df = dataset[input_cols] # Select input columns with quoted column names.
697
- if hasattr(estimator, "feature_names_in_"):
698
- missing_features = []
699
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
700
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
701
- missing_features.append(f)
702
-
703
- if len(missing_features) > 0:
704
- raise ValueError(
705
- "The feature names should match with those that were passed during fit.\n"
706
- f"Features seen during fit call but not present in the input: {missing_features}\n"
707
- f"Features in the input dataframe : {input_cols}\n"
708
- )
709
- input_df.columns = getattr(estimator, "feature_names_in_")
710
- else:
711
- # Just rename the column names to unquoted identifiers.
712
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
710
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
711
+ missing_features = []
712
+ features_in_dataset = set(dataset.columns)
713
+ columns_to_select = []
714
+ for i, f in enumerate(features_required_by_estimator):
715
+ if (
716
+ i >= len(input_cols)
717
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
718
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
719
+ and quoted_input_cols[i] not in features_in_dataset)
720
+ ):
721
+ missing_features.append(f)
722
+ elif input_cols[i] in features_in_dataset:
723
+ columns_to_select.append(input_cols[i])
724
+ elif unquoted_input_cols[i] in features_in_dataset:
725
+ columns_to_select.append(unquoted_input_cols[i])
726
+ else:
727
+ columns_to_select.append(quoted_input_cols[i])
728
+
729
+ if len(missing_features) > 0:
730
+ raise ValueError(
731
+ "The feature names should match with those that were passed during fit.\n"
732
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
733
+ f"Features in the input dataframe : {input_cols}\n"
734
+ )
735
+ input_df = dataset[columns_to_select]
736
+ input_df.columns = features_required_by_estimator
713
737
 
714
738
  transformed_numpy_array = getattr(estimator, inference_method)(
715
739
  input_df
@@ -790,11 +814,18 @@ class KNeighborsRegressor(BaseTransformer):
790
814
  Transformed dataset.
791
815
  """
792
816
  if isinstance(dataset, DataFrame):
817
+ expected_type_inferred = "float"
818
+ # when it is classifier, infer the datatype from label columns
819
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
820
+ expected_type_inferred = convert_sp_to_sf_type(
821
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
822
+ )
823
+
793
824
  output_df = self._batch_inference(
794
825
  dataset=dataset,
795
826
  inference_method="predict",
796
827
  expected_output_cols_list=self.output_cols,
797
- expected_output_cols_type="float",
828
+ expected_output_cols_type=expected_type_inferred,
798
829
  )
799
830
  elif isinstance(dataset, pd.DataFrame):
800
831
  output_df = self._sklearn_inference(
@@ -865,10 +896,10 @@ class KNeighborsRegressor(BaseTransformer):
865
896
 
866
897
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
867
898
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
868
- Returns an empty list if current object is not a classifier or not yet fitted.
899
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
869
900
  """
870
901
  if getattr(self._sklearn_object, "classes_", None) is None:
871
- return []
902
+ return [output_cols_prefix]
872
903
 
873
904
  classes = self._sklearn_object.classes_
874
905
  if isinstance(classes, numpy.ndarray):
@@ -1093,7 +1124,7 @@ class KNeighborsRegressor(BaseTransformer):
1093
1124
  cp.dump(self._sklearn_object, local_score_file)
1094
1125
 
1095
1126
  # Create temp stage to run score.
1096
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1127
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1097
1128
  session = dataset._session
1098
1129
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1099
1130
  SqlResultValidator(
@@ -1107,8 +1138,9 @@ class KNeighborsRegressor(BaseTransformer):
1107
1138
  expected_value=f"Stage area {score_stage_name} successfully created."
1108
1139
  ).validate()
1109
1140
 
1110
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1111
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1141
+ # Use posixpath to construct stage paths
1142
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1143
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1112
1144
  statement_params = telemetry.get_function_usage_statement_params(
1113
1145
  project=_PROJECT,
1114
1146
  subproject=_SUBPROJECT,
@@ -1134,6 +1166,7 @@ class KNeighborsRegressor(BaseTransformer):
1134
1166
  replace=True,
1135
1167
  session=session,
1136
1168
  statement_params=statement_params,
1169
+ anonymous=True
1137
1170
  )
1138
1171
  def score_wrapper_sproc(
1139
1172
  session: Session,
@@ -1141,7 +1174,8 @@ class KNeighborsRegressor(BaseTransformer):
1141
1174
  stage_score_file_name: str,
1142
1175
  input_cols: List[str],
1143
1176
  label_cols: List[str],
1144
- sample_weight_col: Optional[str]
1177
+ sample_weight_col: Optional[str],
1178
+ statement_params: Dict[str, str]
1145
1179
  ) -> float:
1146
1180
  import cloudpickle as cp
1147
1181
  import numpy as np
@@ -1191,14 +1225,14 @@ class KNeighborsRegressor(BaseTransformer):
1191
1225
  api_calls=[Session.call],
1192
1226
  custom_tags=dict([("autogen", True)]),
1193
1227
  )
1194
- score = session.call(
1195
- score_sproc_name,
1228
+ score = score_wrapper_sproc(
1229
+ session,
1196
1230
  query,
1197
1231
  stage_score_file_name,
1198
1232
  identifier.get_unescaped_names(self.input_cols),
1199
1233
  identifier.get_unescaped_names(self.label_cols),
1200
1234
  identifier.get_unescaped_names(self.sample_weight_col),
1201
- statement_params=statement_params,
1235
+ statement_params,
1202
1236
  )
1203
1237
 
1204
1238
  cleanup_temp_files([local_score_file_name])
@@ -1216,18 +1250,20 @@ class KNeighborsRegressor(BaseTransformer):
1216
1250
  if self._sklearn_object._estimator_type == 'classifier':
1217
1251
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1218
1252
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1219
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1253
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1254
+ ([] if self._drop_input_cols else inputs) + outputs)
1220
1255
  # For regressor, the type of predict is float64
1221
1256
  elif self._sklearn_object._estimator_type == 'regressor':
1222
1257
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1223
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1224
-
1258
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1259
+ ([] if self._drop_input_cols else inputs) + outputs)
1225
1260
  for prob_func in PROB_FUNCTIONS:
1226
1261
  if hasattr(self, prob_func):
1227
1262
  output_cols_prefix: str = f"{prob_func}_"
1228
1263
  output_column_names = self._get_output_column_names(output_cols_prefix)
1229
1264
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1230
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1265
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1266
+ ([] if self._drop_input_cols else inputs) + outputs)
1231
1267
 
1232
1268
  @property
1233
1269
  def model_signatures(self) -> Dict[str, ModelSignature]: