snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/neighbors/kernel_density.py (+79 -43)

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -233,7 +235,6 @@ class KernelDensity(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -261,6 +262,15 @@ class KernelDensity(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -339,7 +349,7 @@ class KernelDensity(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -352,11 +362,12 @@ class KernelDensity(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -382,6 +393,7 @@ class KernelDensity(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -390,7 +402,8 @@ class KernelDensity(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -457,15 +470,15 @@ class KernelDensity(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -475,7 +488,7 @@ class KernelDensity(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -521,7 +534,7 @@ class KernelDensity(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -613,7 +626,7 @@ class KernelDensity(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -669,26 +682,37 @@ class KernelDensity(BaseTransformer):
        # input cols need to match unquoted / quoted
        input_cols = self.input_cols
        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
        estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
        transformed_numpy_array = getattr(estimator, inference_method)(
            input_df
@@ -767,11 +791,18 @@ class KernelDensity(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -842,10 +873,10 @@ class KernelDensity(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1070,7 +1101,7 @@ class KernelDensity(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1084,8 +1115,9 @@ class KernelDensity(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1111,6 +1143,7 @@ class KernelDensity(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1118,7 +1151,8 @@ class KernelDensity(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1168,14 +1202,14 @@ class KernelDensity(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1193,18 +1227,20 @@ class KernelDensity(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
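
A note on the recurring os.path.join → posixpath.join change in the hunks above: Snowflake stage paths always use forward slashes, so building them with os.path.join breaks on Windows clients, where the path separator is a backslash. A minimal standalone sketch of the difference (the stage and file names here are made up for illustration):

import ntpath      # behaves like os.path does on Windows, on any host OS
import posixpath

stage_name = "SNOWML_TRANSFORM_ABC123"   # hypothetical temporary stage
file_name = "model.pkl"                  # hypothetical pickled estimator

# posixpath always emits the forward slash that stage paths require.
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl

# os.path.join on a Windows client would emit a backslash instead,
# yielding a stage path Snowflake cannot resolve.
print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
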
snowflake/ml/modeling/neighbors/local_outlier_factor.py (+79 -43)

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -261,7 +263,6 @@ class LocalOutlierFactor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -289,6 +290,15 @@ class LocalOutlierFactor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -367,7 +377,7 @@ class LocalOutlierFactor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -380,11 +390,12 @@ class LocalOutlierFactor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -410,6 +421,7 @@ class LocalOutlierFactor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -418,7 +430,8 @@ class LocalOutlierFactor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -485,15 +498,15 @@ class LocalOutlierFactor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -503,7 +516,7 @@ class LocalOutlierFactor(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -549,7 +562,7 @@ class LocalOutlierFactor(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -641,7 +654,7 @@ class LocalOutlierFactor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
        pass_through_columns = self._get_pass_through_columns(dataset)
@@ -697,26 +710,37 @@ class LocalOutlierFactor(BaseTransformer):
        # input cols need to match unquoted / quoted
        input_cols = self.input_cols
        unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
        estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
        transformed_numpy_array = getattr(estimator, inference_method)(
            input_df
@@ -797,11 +821,18 @@ class LocalOutlierFactor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -872,10 +903,10 @@ class LocalOutlierFactor(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1102,7 +1133,7 @@ class LocalOutlierFactor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1116,8 +1147,9 @@ class LocalOutlierFactor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1143,6 +1175,7 @@ class LocalOutlierFactor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1150,7 +1183,8 @@ class LocalOutlierFactor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1200,14 +1234,14 @@ class LocalOutlierFactor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
        )
 
        cleanup_temp_files([local_score_file_name])
@@ -1225,18 +1259,20 @@ class LocalOutlierFactor(BaseTransformer):
        if self._sklearn_object._estimator_type == 'classifier':
            outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
            outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
        # For regressor, the type of predict is float64
        elif self._sklearn_object._estimator_type == 'regressor':
            outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
        for prob_func in PROB_FUNCTIONS:
            if hasattr(self, prob_func):
                output_cols_prefix: str = f"{prob_func}_"
                output_column_names = self._get_output_column_names(output_cols_prefix)
                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
    @property
    def model_signatures(self) -> Dict[str, ModelSignature]:
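
The largest behavioral change repeated in both files is the rewritten column matching in _sklearn_inference: rather than selecting dataset columns by the configured (possibly quoted) names alone, each feature the estimator saw at fit time is now matched against the original, unquoted, and quoted forms of the input columns, whichever actually exists in the dataframe. A simplified standalone rendering of that logic (the function name and sample data are hypothetical, and identifier quoting is mimicked with literal double quotes):

from typing import List

import pandas as pd

def select_input_columns(
    dataset: pd.DataFrame,
    input_cols: List[str],           # columns as configured on the transformer
    unquoted_input_cols: List[str],  # identifier.get_unescaped_names(input_cols)
    quoted_input_cols: List[str],    # identifier.get_escaped_names(unquoted_input_cols)
    required: List[str],             # estimator.feature_names_in_ captured at fit time
) -> pd.DataFrame:
    features_in_dataset = set(dataset.columns)
    missing, columns_to_select = [], []
    for i, f in enumerate(required):
        # The three naming forms of the i-th configured input column.
        forms = (
            (input_cols[i], unquoted_input_cols[i], quoted_input_cols[i])
            if i < len(input_cols)
            else ()
        )
        # A feature is missing if no form matches it by name, or if none of
        # the forms is actually present as a column of the dataframe.
        if f not in forms or not any(c in features_in_dataset for c in forms):
            missing.append(f)
        else:
            columns_to_select.append(next(c for c in forms if c in features_in_dataset))
    if missing:
        raise ValueError(
            "The feature names should match with those that were passed during fit.\n"
            f"Features seen during fit call but not present in the input: {missing}"
        )
    input_df = dataset[columns_to_select].copy()
    input_df.columns = required  # rename to the names the estimator expects
    return input_df

# The dataframe holds unquoted names while the transformer was configured with
# quoted identifiers; the unquoted form matches and is renamed back for sklearn.
df = pd.DataFrame({"FEAT_A": [1.0], "FEAT_B": [2.0]})
out = select_input_columns(
    df,
    input_cols=['"FEAT_A"', '"FEAT_B"'],
    unquoted_input_cols=["FEAT_A", "FEAT_B"],
    quoted_input_cols=['"FEAT_A"', '"FEAT_B"'],
    required=["FEAT_A", "FEAT_B"],
)
print(list(out.columns))  # ['FEAT_A', 'FEAT_B']
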