snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -233,7 +235,6 @@ class KNNImputer(BaseTransformer):
233
235
  sample_weight_col: Optional[str] = None,
234
236
  ) -> None:
235
237
  super().__init__()
236
- self.id = str(uuid4()).replace("-", "_").upper()
237
238
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
238
239
 
239
240
  self._deps = list(deps)
@@ -259,6 +260,15 @@ class KNNImputer(BaseTransformer):
259
260
  self.set_drop_input_cols(drop_input_cols)
260
261
  self.set_sample_weight_col(sample_weight_col)
261
262
 
263
+ def _get_rand_id(self) -> str:
264
+ """
265
+ Generate random id to be used in sproc and stage names.
266
+
267
+ Returns:
268
+ Random id string usable in sproc, table, and stage names.
269
+ """
270
+ return str(uuid4()).replace("-", "_").upper()
271
+
262
272
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
263
273
  """
264
274
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -337,7 +347,7 @@ class KNNImputer(BaseTransformer):
337
347
  cp.dump(self._sklearn_object, local_transform_file)
338
348
 
339
349
  # Create temp stage to run fit.
340
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
350
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
341
351
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
342
352
  SqlResultValidator(
343
353
  session=session,
@@ -350,11 +360,12 @@ class KNNImputer(BaseTransformer):
350
360
  expected_value=f"Stage area {transform_stage_name} successfully created."
351
361
  ).validate()
352
362
 
353
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
363
+ # Use posixpath to construct stage paths
364
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
365
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
354
366
  local_result_file_name = get_temp_file_path()
355
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
356
367
 
357
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
368
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
358
369
  statement_params = telemetry.get_function_usage_statement_params(
359
370
  project=_PROJECT,
360
371
  subproject=_SUBPROJECT,
@@ -380,6 +391,7 @@ class KNNImputer(BaseTransformer):
380
391
  replace=True,
381
392
  session=session,
382
393
  statement_params=statement_params,
394
+ anonymous=True
383
395
  )
384
396
  def fit_wrapper_sproc(
385
397
  session: Session,
@@ -388,7 +400,8 @@ class KNNImputer(BaseTransformer):
388
400
  stage_result_file_name: str,
389
401
  input_cols: List[str],
390
402
  label_cols: List[str],
391
- sample_weight_col: Optional[str]
403
+ sample_weight_col: Optional[str],
404
+ statement_params: Dict[str, str]
392
405
  ) -> str:
393
406
  import cloudpickle as cp
394
407
  import numpy as np
@@ -455,15 +468,15 @@ class KNNImputer(BaseTransformer):
455
468
  api_calls=[Session.call],
456
469
  custom_tags=dict([("autogen", True)]),
457
470
  )
458
- sproc_export_file_name = session.call(
459
- fit_sproc_name,
471
+ sproc_export_file_name = fit_wrapper_sproc(
472
+ session,
460
473
  query,
461
474
  stage_transform_file_name,
462
475
  stage_result_file_name,
463
476
  identifier.get_unescaped_names(self.input_cols),
464
477
  identifier.get_unescaped_names(self.label_cols),
465
478
  identifier.get_unescaped_names(self.sample_weight_col),
466
- statement_params=statement_params,
479
+ statement_params,
467
480
  )
468
481
 
469
482
  if "|" in sproc_export_file_name:
@@ -473,7 +486,7 @@ class KNNImputer(BaseTransformer):
473
486
  print("\n".join(fields[1:]))
474
487
 
475
488
  session.file.get(
476
- os.path.join(stage_result_file_name, sproc_export_file_name),
489
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
477
490
  local_result_file_name,
478
491
  statement_params=statement_params
479
492
  )
@@ -519,7 +532,7 @@ class KNNImputer(BaseTransformer):
519
532
 
520
533
  # Register vectorized UDF for batch inference
521
534
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
522
- safe_id=self.id, method=inference_method)
535
+ safe_id=self._get_rand_id(), method=inference_method)
523
536
 
524
537
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
525
538
  # will try to pickle all of self which fails.
@@ -611,7 +624,7 @@ class KNNImputer(BaseTransformer):
611
624
  return transformed_pandas_df.to_dict("records")
612
625
 
613
626
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
614
- safe_id=self.id
627
+ safe_id=self._get_rand_id()
615
628
  )
616
629
 
617
630
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -667,26 +680,37 @@ class KNNImputer(BaseTransformer):
667
680
  # input cols need to match unquoted / quoted
668
681
  input_cols = self.input_cols
669
682
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
683
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
670
684
 
671
685
  estimator = self._sklearn_object
672
686
 
673
- input_df = dataset[input_cols] # Select input columns with quoted column names.
674
- if hasattr(estimator, "feature_names_in_"):
675
- missing_features = []
676
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
677
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
678
- missing_features.append(f)
679
-
680
- if len(missing_features) > 0:
681
- raise ValueError(
682
- "The feature names should match with those that were passed during fit.\n"
683
- f"Features seen during fit call but not present in the input: {missing_features}\n"
684
- f"Features in the input dataframe : {input_cols}\n"
685
- )
686
- input_df.columns = getattr(estimator, "feature_names_in_")
687
- else:
688
- # Just rename the column names to unquoted identifiers.
689
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
687
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
688
+ missing_features = []
689
+ features_in_dataset = set(dataset.columns)
690
+ columns_to_select = []
691
+ for i, f in enumerate(features_required_by_estimator):
692
+ if (
693
+ i >= len(input_cols)
694
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
695
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
696
+ and quoted_input_cols[i] not in features_in_dataset)
697
+ ):
698
+ missing_features.append(f)
699
+ elif input_cols[i] in features_in_dataset:
700
+ columns_to_select.append(input_cols[i])
701
+ elif unquoted_input_cols[i] in features_in_dataset:
702
+ columns_to_select.append(unquoted_input_cols[i])
703
+ else:
704
+ columns_to_select.append(quoted_input_cols[i])
705
+
706
+ if len(missing_features) > 0:
707
+ raise ValueError(
708
+ "The feature names should match with those that were passed during fit.\n"
709
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
710
+ f"Features in the input dataframe : {input_cols}\n"
711
+ )
712
+ input_df = dataset[columns_to_select]
713
+ input_df.columns = features_required_by_estimator
690
714
 
691
715
  transformed_numpy_array = getattr(estimator, inference_method)(
692
716
  input_df
@@ -765,11 +789,18 @@ class KNNImputer(BaseTransformer):
765
789
  Transformed dataset.
766
790
  """
767
791
  if isinstance(dataset, DataFrame):
792
+ expected_type_inferred = ""
793
+ # when it is classifier, infer the datatype from label columns
794
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
795
+ expected_type_inferred = convert_sp_to_sf_type(
796
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
797
+ )
798
+
768
799
  output_df = self._batch_inference(
769
800
  dataset=dataset,
770
801
  inference_method="predict",
771
802
  expected_output_cols_list=self.output_cols,
772
- expected_output_cols_type="",
803
+ expected_output_cols_type=expected_type_inferred,
773
804
  )
774
805
  elif isinstance(dataset, pd.DataFrame):
775
806
  output_df = self._sklearn_inference(
@@ -842,10 +873,10 @@ class KNNImputer(BaseTransformer):
842
873
 
843
874
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
844
875
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
845
- Returns an empty list if current object is not a classifier or not yet fitted.
876
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
846
877
  """
847
878
  if getattr(self._sklearn_object, "classes_", None) is None:
848
- return []
879
+ return [output_cols_prefix]
849
880
 
850
881
  classes = self._sklearn_object.classes_
851
882
  if isinstance(classes, numpy.ndarray):
@@ -1070,7 +1101,7 @@ class KNNImputer(BaseTransformer):
1070
1101
  cp.dump(self._sklearn_object, local_score_file)
1071
1102
 
1072
1103
  # Create temp stage to run score.
1073
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1104
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1074
1105
  session = dataset._session
1075
1106
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1076
1107
  SqlResultValidator(
@@ -1084,8 +1115,9 @@ class KNNImputer(BaseTransformer):
1084
1115
  expected_value=f"Stage area {score_stage_name} successfully created."
1085
1116
  ).validate()
1086
1117
 
1087
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1088
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1118
+ # Use posixpath to construct stage paths
1119
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1120
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1089
1121
  statement_params = telemetry.get_function_usage_statement_params(
1090
1122
  project=_PROJECT,
1091
1123
  subproject=_SUBPROJECT,
@@ -1111,6 +1143,7 @@ class KNNImputer(BaseTransformer):
1111
1143
  replace=True,
1112
1144
  session=session,
1113
1145
  statement_params=statement_params,
1146
+ anonymous=True
1114
1147
  )
1115
1148
  def score_wrapper_sproc(
1116
1149
  session: Session,
@@ -1118,7 +1151,8 @@ class KNNImputer(BaseTransformer):
1118
1151
  stage_score_file_name: str,
1119
1152
  input_cols: List[str],
1120
1153
  label_cols: List[str],
1121
- sample_weight_col: Optional[str]
1154
+ sample_weight_col: Optional[str],
1155
+ statement_params: Dict[str, str]
1122
1156
  ) -> float:
1123
1157
  import cloudpickle as cp
1124
1158
  import numpy as np
@@ -1168,14 +1202,14 @@ class KNNImputer(BaseTransformer):
1168
1202
  api_calls=[Session.call],
1169
1203
  custom_tags=dict([("autogen", True)]),
1170
1204
  )
1171
- score = session.call(
1172
- score_sproc_name,
1205
+ score = score_wrapper_sproc(
1206
+ session,
1173
1207
  query,
1174
1208
  stage_score_file_name,
1175
1209
  identifier.get_unescaped_names(self.input_cols),
1176
1210
  identifier.get_unescaped_names(self.label_cols),
1177
1211
  identifier.get_unescaped_names(self.sample_weight_col),
1178
- statement_params=statement_params,
1212
+ statement_params,
1179
1213
  )
1180
1214
 
1181
1215
  cleanup_temp_files([local_score_file_name])
@@ -1193,18 +1227,20 @@ class KNNImputer(BaseTransformer):
1193
1227
  if self._sklearn_object._estimator_type == 'classifier':
1194
1228
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1195
1229
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1196
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1230
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1231
+ ([] if self._drop_input_cols else inputs) + outputs)
1197
1232
  # For regressor, the type of predict is float64
1198
1233
  elif self._sklearn_object._estimator_type == 'regressor':
1199
1234
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1200
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1201
-
1235
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1236
+ ([] if self._drop_input_cols else inputs) + outputs)
1202
1237
  for prob_func in PROB_FUNCTIONS:
1203
1238
  if hasattr(self, prob_func):
1204
1239
  output_cols_prefix: str = f"{prob_func}_"
1205
1240
  output_column_names = self._get_output_column_names(output_cols_prefix)
1206
1241
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1207
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1242
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1243
+ ([] if self._drop_input_cols else inputs) + outputs)
1208
1244
 
1209
1245
  @property
1210
1246
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -210,7 +212,6 @@ class MissingIndicator(BaseTransformer):
210
212
  sample_weight_col: Optional[str] = None,
211
213
  ) -> None:
212
214
  super().__init__()
213
- self.id = str(uuid4()).replace("-", "_").upper()
214
215
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
215
216
 
216
217
  self._deps = list(deps)
@@ -233,6 +234,15 @@ class MissingIndicator(BaseTransformer):
233
234
  self.set_drop_input_cols(drop_input_cols)
234
235
  self.set_sample_weight_col(sample_weight_col)
235
236
 
237
+ def _get_rand_id(self) -> str:
238
+ """
239
+ Generate random id to be used in sproc and stage names.
240
+
241
+ Returns:
242
+ Random id string usable in sproc, table, and stage names.
243
+ """
244
+ return str(uuid4()).replace("-", "_").upper()
245
+
236
246
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
237
247
  """
238
248
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -311,7 +321,7 @@ class MissingIndicator(BaseTransformer):
311
321
  cp.dump(self._sklearn_object, local_transform_file)
312
322
 
313
323
  # Create temp stage to run fit.
314
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
324
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
315
325
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
316
326
  SqlResultValidator(
317
327
  session=session,
@@ -324,11 +334,12 @@ class MissingIndicator(BaseTransformer):
324
334
  expected_value=f"Stage area {transform_stage_name} successfully created."
325
335
  ).validate()
326
336
 
327
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
337
+ # Use posixpath to construct stage paths
338
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
339
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
328
340
  local_result_file_name = get_temp_file_path()
329
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
330
341
 
331
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
342
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
332
343
  statement_params = telemetry.get_function_usage_statement_params(
333
344
  project=_PROJECT,
334
345
  subproject=_SUBPROJECT,
@@ -354,6 +365,7 @@ class MissingIndicator(BaseTransformer):
354
365
  replace=True,
355
366
  session=session,
356
367
  statement_params=statement_params,
368
+ anonymous=True
357
369
  )
358
370
  def fit_wrapper_sproc(
359
371
  session: Session,
@@ -362,7 +374,8 @@ class MissingIndicator(BaseTransformer):
362
374
  stage_result_file_name: str,
363
375
  input_cols: List[str],
364
376
  label_cols: List[str],
365
- sample_weight_col: Optional[str]
377
+ sample_weight_col: Optional[str],
378
+ statement_params: Dict[str, str]
366
379
  ) -> str:
367
380
  import cloudpickle as cp
368
381
  import numpy as np
@@ -429,15 +442,15 @@ class MissingIndicator(BaseTransformer):
429
442
  api_calls=[Session.call],
430
443
  custom_tags=dict([("autogen", True)]),
431
444
  )
432
- sproc_export_file_name = session.call(
433
- fit_sproc_name,
445
+ sproc_export_file_name = fit_wrapper_sproc(
446
+ session,
434
447
  query,
435
448
  stage_transform_file_name,
436
449
  stage_result_file_name,
437
450
  identifier.get_unescaped_names(self.input_cols),
438
451
  identifier.get_unescaped_names(self.label_cols),
439
452
  identifier.get_unescaped_names(self.sample_weight_col),
440
- statement_params=statement_params,
453
+ statement_params,
441
454
  )
442
455
 
443
456
  if "|" in sproc_export_file_name:
@@ -447,7 +460,7 @@ class MissingIndicator(BaseTransformer):
447
460
  print("\n".join(fields[1:]))
448
461
 
449
462
  session.file.get(
450
- os.path.join(stage_result_file_name, sproc_export_file_name),
463
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
451
464
  local_result_file_name,
452
465
  statement_params=statement_params
453
466
  )
@@ -493,7 +506,7 @@ class MissingIndicator(BaseTransformer):
493
506
 
494
507
  # Register vectorized UDF for batch inference
495
508
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
496
- safe_id=self.id, method=inference_method)
509
+ safe_id=self._get_rand_id(), method=inference_method)
497
510
 
498
511
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
499
512
  # will try to pickle all of self which fails.
@@ -585,7 +598,7 @@ class MissingIndicator(BaseTransformer):
585
598
  return transformed_pandas_df.to_dict("records")
586
599
 
587
600
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
588
- safe_id=self.id
601
+ safe_id=self._get_rand_id()
589
602
  )
590
603
 
591
604
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -641,26 +654,37 @@ class MissingIndicator(BaseTransformer):
641
654
  # input cols need to match unquoted / quoted
642
655
  input_cols = self.input_cols
643
656
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
657
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
644
658
 
645
659
  estimator = self._sklearn_object
646
660
 
647
- input_df = dataset[input_cols] # Select input columns with quoted column names.
648
- if hasattr(estimator, "feature_names_in_"):
649
- missing_features = []
650
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
651
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
652
- missing_features.append(f)
653
-
654
- if len(missing_features) > 0:
655
- raise ValueError(
656
- "The feature names should match with those that were passed during fit.\n"
657
- f"Features seen during fit call but not present in the input: {missing_features}\n"
658
- f"Features in the input dataframe : {input_cols}\n"
659
- )
660
- input_df.columns = getattr(estimator, "feature_names_in_")
661
- else:
662
- # Just rename the column names to unquoted identifiers.
663
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
661
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
662
+ missing_features = []
663
+ features_in_dataset = set(dataset.columns)
664
+ columns_to_select = []
665
+ for i, f in enumerate(features_required_by_estimator):
666
+ if (
667
+ i >= len(input_cols)
668
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
669
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
670
+ and quoted_input_cols[i] not in features_in_dataset)
671
+ ):
672
+ missing_features.append(f)
673
+ elif input_cols[i] in features_in_dataset:
674
+ columns_to_select.append(input_cols[i])
675
+ elif unquoted_input_cols[i] in features_in_dataset:
676
+ columns_to_select.append(unquoted_input_cols[i])
677
+ else:
678
+ columns_to_select.append(quoted_input_cols[i])
679
+
680
+ if len(missing_features) > 0:
681
+ raise ValueError(
682
+ "The feature names should match with those that were passed during fit.\n"
683
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
684
+ f"Features in the input dataframe : {input_cols}\n"
685
+ )
686
+ input_df = dataset[columns_to_select]
687
+ input_df.columns = features_required_by_estimator
664
688
 
665
689
  transformed_numpy_array = getattr(estimator, inference_method)(
666
690
  input_df
@@ -739,11 +763,18 @@ class MissingIndicator(BaseTransformer):
739
763
  Transformed dataset.
740
764
  """
741
765
  if isinstance(dataset, DataFrame):
766
+ expected_type_inferred = ""
767
+ # when it is classifier, infer the datatype from label columns
768
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
769
+ expected_type_inferred = convert_sp_to_sf_type(
770
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
771
+ )
772
+
742
773
  output_df = self._batch_inference(
743
774
  dataset=dataset,
744
775
  inference_method="predict",
745
776
  expected_output_cols_list=self.output_cols,
746
- expected_output_cols_type="",
777
+ expected_output_cols_type=expected_type_inferred,
747
778
  )
748
779
  elif isinstance(dataset, pd.DataFrame):
749
780
  output_df = self._sklearn_inference(
@@ -816,10 +847,10 @@ class MissingIndicator(BaseTransformer):
816
847
 
817
848
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
818
849
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
819
- Returns an empty list if current object is not a classifier or not yet fitted.
850
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
820
851
  """
821
852
  if getattr(self._sklearn_object, "classes_", None) is None:
822
- return []
853
+ return [output_cols_prefix]
823
854
 
824
855
  classes = self._sklearn_object.classes_
825
856
  if isinstance(classes, numpy.ndarray):
@@ -1044,7 +1075,7 @@ class MissingIndicator(BaseTransformer):
1044
1075
  cp.dump(self._sklearn_object, local_score_file)
1045
1076
 
1046
1077
  # Create temp stage to run score.
1047
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1078
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1048
1079
  session = dataset._session
1049
1080
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1050
1081
  SqlResultValidator(
@@ -1058,8 +1089,9 @@ class MissingIndicator(BaseTransformer):
1058
1089
  expected_value=f"Stage area {score_stage_name} successfully created."
1059
1090
  ).validate()
1060
1091
 
1061
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1062
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1092
+ # Use posixpath to construct stage paths
1093
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1094
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1063
1095
  statement_params = telemetry.get_function_usage_statement_params(
1064
1096
  project=_PROJECT,
1065
1097
  subproject=_SUBPROJECT,
@@ -1085,6 +1117,7 @@ class MissingIndicator(BaseTransformer):
1085
1117
  replace=True,
1086
1118
  session=session,
1087
1119
  statement_params=statement_params,
1120
+ anonymous=True
1088
1121
  )
1089
1122
  def score_wrapper_sproc(
1090
1123
  session: Session,
@@ -1092,7 +1125,8 @@ class MissingIndicator(BaseTransformer):
1092
1125
  stage_score_file_name: str,
1093
1126
  input_cols: List[str],
1094
1127
  label_cols: List[str],
1095
- sample_weight_col: Optional[str]
1128
+ sample_weight_col: Optional[str],
1129
+ statement_params: Dict[str, str]
1096
1130
  ) -> float:
1097
1131
  import cloudpickle as cp
1098
1132
  import numpy as np
@@ -1142,14 +1176,14 @@ class MissingIndicator(BaseTransformer):
1142
1176
  api_calls=[Session.call],
1143
1177
  custom_tags=dict([("autogen", True)]),
1144
1178
  )
1145
- score = session.call(
1146
- score_sproc_name,
1179
+ score = score_wrapper_sproc(
1180
+ session,
1147
1181
  query,
1148
1182
  stage_score_file_name,
1149
1183
  identifier.get_unescaped_names(self.input_cols),
1150
1184
  identifier.get_unescaped_names(self.label_cols),
1151
1185
  identifier.get_unescaped_names(self.sample_weight_col),
1152
- statement_params=statement_params,
1186
+ statement_params,
1153
1187
  )
1154
1188
 
1155
1189
  cleanup_temp_files([local_score_file_name])
@@ -1167,18 +1201,20 @@ class MissingIndicator(BaseTransformer):
1167
1201
  if self._sklearn_object._estimator_type == 'classifier':
1168
1202
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1169
1203
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1170
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1204
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1205
+ ([] if self._drop_input_cols else inputs) + outputs)
1171
1206
  # For regressor, the type of predict is float64
1172
1207
  elif self._sklearn_object._estimator_type == 'regressor':
1173
1208
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1174
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1175
-
1209
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1210
+ ([] if self._drop_input_cols else inputs) + outputs)
1176
1211
  for prob_func in PROB_FUNCTIONS:
1177
1212
  if hasattr(self, prob_func):
1178
1213
  output_cols_prefix: str = f"{prob_func}_"
1179
1214
  output_column_names = self._get_output_column_names(output_cols_prefix)
1180
1215
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1181
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1216
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1217
+ ([] if self._drop_input_cols else inputs) + outputs)
1182
1218
 
1183
1219
  @property
1184
1220
  def model_signatures(self) -> Dict[str, ModelSignature]: