snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/neighbors/nearest_centroid.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -199,7 +201,6 @@ class NearestCentroid(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -220,6 +221,15 @@ class NearestCentroid(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -298,7 +308,7 @@ class NearestCentroid(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -311,11 +321,12 @@ class NearestCentroid(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -341,6 +352,7 @@ class NearestCentroid(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -349,7 +361,8 @@ class NearestCentroid(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -416,15 +429,15 @@ class NearestCentroid(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -434,7 +447,7 @@ class NearestCentroid(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -480,7 +493,7 @@ class NearestCentroid(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -572,7 +585,7 @@ class NearestCentroid(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -628,26 +641,37 @@ class NearestCentroid(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -728,11 +752,18 @@ class NearestCentroid(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -803,10 +834,10 @@ class NearestCentroid(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1031,7 +1062,7 @@ class NearestCentroid(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1045,8 +1076,9 @@ class NearestCentroid(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1072,6 +1104,7 @@ class NearestCentroid(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1079,7 +1112,8 @@ class NearestCentroid(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1129,14 +1163,14 @@ class NearestCentroid(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1154,18 +1188,20 @@ class NearestCentroid(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
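
A note on the recurring os.path.join → posixpath.join substitution in the hunks above: stage paths are always slash-separated, while os.path.join uses the client OS's separator, so path construction on a Windows client would produce backslashes. The following standalone sketch (not part of the package; ntpath is the stdlib's Windows flavor of os.path, so the contrast runs on any platform; the stage and file names are hypothetical) illustrates the difference:

import ntpath      # os.path as it behaves on Windows
import posixpath   # os.path as it behaves on POSIX; always forward slashes

stage_name = "SNOWML_TRANSFORM_ABC123"   # hypothetical stage name
file_name = "model.pkl"                  # hypothetical uploaded file

# On a Windows client, os.path.join resolves to ntpath.join and yields a
# backslash-separated string, which is not a usable stage path:
print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl
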
snowflake/ml/modeling/neighbors/nearest_neighbors.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -245,7 +247,6 @@ class NearestNeighbors(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -272,6 +273,15 @@ class NearestNeighbors(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -350,7 +360,7 @@ class NearestNeighbors(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -363,11 +373,12 @@ class NearestNeighbors(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -393,6 +404,7 @@ class NearestNeighbors(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -401,7 +413,8 @@ class NearestNeighbors(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -468,15 +481,15 @@ class NearestNeighbors(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -486,7 +499,7 @@ class NearestNeighbors(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -532,7 +545,7 @@ class NearestNeighbors(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -624,7 +637,7 @@ class NearestNeighbors(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -680,26 +693,37 @@ class NearestNeighbors(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -778,11 +802,18 @@ class NearestNeighbors(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -853,10 +884,10 @@ class NearestNeighbors(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1081,7 +1112,7 @@ class NearestNeighbors(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1095,8 +1126,9 @@ class NearestNeighbors(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1122,6 +1154,7 @@ class NearestNeighbors(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1129,7 +1162,8 @@ class NearestNeighbors(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1179,14 +1213,14 @@ class NearestNeighbors(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1204,18 +1238,20 @@ class NearestNeighbors(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
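
The densest change in both files is the rewritten column-matching block (the "-628,26 +641,37" and "-680,26 +693,37" hunks). The sketch below replays its selection logic outside the class; it is an illustration, not the package's code, and the get_unescaped_names / get_escaped_names functions are hypothetical stand-ins for the snowflake.ml identifier helpers. Each feature the estimator saw at fit time may appear in the pandas frame under its original, unquoted, or quoted spelling; the first spelling present in the frame is selected, and unmatched features are collected as missing.

import pandas as pd

# Hypothetical stand-ins for the identifier helpers used in the diff:
def get_unescaped_names(cols):
    # Strip surrounding double quotes from quoted identifiers.
    return [c[1:-1] if c.startswith('"') and c.endswith('"') else c for c in cols]

def get_escaped_names(cols):
    return [f'"{c}"' for c in cols]

input_cols = ['"sepal_len"', '"sepal_wid"']
unquoted_input_cols = get_unescaped_names(input_cols)        # ['sepal_len', 'sepal_wid']
quoted_input_cols = get_escaped_names(unquoted_input_cols)   # ['"sepal_len"', '"sepal_wid"']

dataset = pd.DataFrame({"sepal_len": [5.1], "sepal_wid": [3.5]})
features_in_dataset = set(dataset.columns)

# As if the estimator exposed no feature_names_in_, fall back to unquoted names;
# the i >= len(input_cols) bounds check from the diff is omitted for brevity.
features_required_by_estimator = unquoted_input_cols
missing_features, columns_to_select = [], []
for i, f in enumerate(features_required_by_estimator):
    # Same order of preference as the diff: original, then unquoted, then quoted.
    candidates = (input_cols[i], unquoted_input_cols[i], quoted_input_cols[i])
    if f not in candidates or not any(c in features_in_dataset for c in candidates):
        missing_features.append(f)
    else:
        columns_to_select.append(next(c for c in candidates if c in features_in_dataset))

print(missing_features)   # []
print(columns_to_select)  # ['sepal_len', 'sepal_wid']
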