snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -255,7 +257,6 @@ class RadiusNeighborsRegressor(BaseTransformer):
255
257
  sample_weight_col: Optional[str] = None,
256
258
  ) -> None:
257
259
  super().__init__()
258
- self.id = str(uuid4()).replace("-", "_").upper()
259
260
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
260
261
 
261
262
  self._deps = list(deps)
@@ -282,6 +283,15 @@ class RadiusNeighborsRegressor(BaseTransformer):
282
283
  self.set_drop_input_cols(drop_input_cols)
283
284
  self.set_sample_weight_col(sample_weight_col)
284
285
 
286
+ def _get_rand_id(self) -> str:
287
+ """
288
+ Generate random id to be used in sproc and stage names.
289
+
290
+ Returns:
291
+ Random id string usable in sproc, table, and stage names.
292
+ """
293
+ return str(uuid4()).replace("-", "_").upper()
294
+
285
295
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
286
296
  """
287
297
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -360,7 +370,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
360
370
  cp.dump(self._sklearn_object, local_transform_file)
361
371
 
362
372
  # Create temp stage to run fit.
363
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
373
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
364
374
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
365
375
  SqlResultValidator(
366
376
  session=session,
@@ -373,11 +383,12 @@ class RadiusNeighborsRegressor(BaseTransformer):
373
383
  expected_value=f"Stage area {transform_stage_name} successfully created."
374
384
  ).validate()
375
385
 
376
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
386
+ # Use posixpath to construct stage paths
387
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
388
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
377
389
  local_result_file_name = get_temp_file_path()
378
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
379
390
 
380
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
391
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
381
392
  statement_params = telemetry.get_function_usage_statement_params(
382
393
  project=_PROJECT,
383
394
  subproject=_SUBPROJECT,
@@ -403,6 +414,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
403
414
  replace=True,
404
415
  session=session,
405
416
  statement_params=statement_params,
417
+ anonymous=True
406
418
  )
407
419
  def fit_wrapper_sproc(
408
420
  session: Session,
@@ -411,7 +423,8 @@ class RadiusNeighborsRegressor(BaseTransformer):
411
423
  stage_result_file_name: str,
412
424
  input_cols: List[str],
413
425
  label_cols: List[str],
414
- sample_weight_col: Optional[str]
426
+ sample_weight_col: Optional[str],
427
+ statement_params: Dict[str, str]
415
428
  ) -> str:
416
429
  import cloudpickle as cp
417
430
  import numpy as np
@@ -478,15 +491,15 @@ class RadiusNeighborsRegressor(BaseTransformer):
478
491
  api_calls=[Session.call],
479
492
  custom_tags=dict([("autogen", True)]),
480
493
  )
481
- sproc_export_file_name = session.call(
482
- fit_sproc_name,
494
+ sproc_export_file_name = fit_wrapper_sproc(
495
+ session,
483
496
  query,
484
497
  stage_transform_file_name,
485
498
  stage_result_file_name,
486
499
  identifier.get_unescaped_names(self.input_cols),
487
500
  identifier.get_unescaped_names(self.label_cols),
488
501
  identifier.get_unescaped_names(self.sample_weight_col),
489
- statement_params=statement_params,
502
+ statement_params,
490
503
  )
491
504
 
492
505
  if "|" in sproc_export_file_name:
@@ -496,7 +509,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
496
509
  print("\n".join(fields[1:]))
497
510
 
498
511
  session.file.get(
499
- os.path.join(stage_result_file_name, sproc_export_file_name),
512
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
500
513
  local_result_file_name,
501
514
  statement_params=statement_params
502
515
  )
@@ -542,7 +555,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
542
555
 
543
556
  # Register vectorized UDF for batch inference
544
557
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
545
- safe_id=self.id, method=inference_method)
558
+ safe_id=self._get_rand_id(), method=inference_method)
546
559
 
547
560
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
548
561
  # will try to pickle all of self which fails.
@@ -634,7 +647,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
634
647
  return transformed_pandas_df.to_dict("records")
635
648
 
636
649
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
637
- safe_id=self.id
650
+ safe_id=self._get_rand_id()
638
651
  )
639
652
 
640
653
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -690,26 +703,37 @@ class RadiusNeighborsRegressor(BaseTransformer):
690
703
  # input cols need to match unquoted / quoted
691
704
  input_cols = self.input_cols
692
705
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
706
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
693
707
 
694
708
  estimator = self._sklearn_object
695
709
 
696
- input_df = dataset[input_cols] # Select input columns with quoted column names.
697
- if hasattr(estimator, "feature_names_in_"):
698
- missing_features = []
699
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
700
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
701
- missing_features.append(f)
702
-
703
- if len(missing_features) > 0:
704
- raise ValueError(
705
- "The feature names should match with those that were passed during fit.\n"
706
- f"Features seen during fit call but not present in the input: {missing_features}\n"
707
- f"Features in the input dataframe : {input_cols}\n"
708
- )
709
- input_df.columns = getattr(estimator, "feature_names_in_")
710
- else:
711
- # Just rename the column names to unquoted identifiers.
712
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
710
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
711
+ missing_features = []
712
+ features_in_dataset = set(dataset.columns)
713
+ columns_to_select = []
714
+ for i, f in enumerate(features_required_by_estimator):
715
+ if (
716
+ i >= len(input_cols)
717
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
718
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
719
+ and quoted_input_cols[i] not in features_in_dataset)
720
+ ):
721
+ missing_features.append(f)
722
+ elif input_cols[i] in features_in_dataset:
723
+ columns_to_select.append(input_cols[i])
724
+ elif unquoted_input_cols[i] in features_in_dataset:
725
+ columns_to_select.append(unquoted_input_cols[i])
726
+ else:
727
+ columns_to_select.append(quoted_input_cols[i])
728
+
729
+ if len(missing_features) > 0:
730
+ raise ValueError(
731
+ "The feature names should match with those that were passed during fit.\n"
732
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
733
+ f"Features in the input dataframe : {input_cols}\n"
734
+ )
735
+ input_df = dataset[columns_to_select]
736
+ input_df.columns = features_required_by_estimator
713
737
 
714
738
  transformed_numpy_array = getattr(estimator, inference_method)(
715
739
  input_df
@@ -790,11 +814,18 @@ class RadiusNeighborsRegressor(BaseTransformer):
790
814
  Transformed dataset.
791
815
  """
792
816
  if isinstance(dataset, DataFrame):
817
+ expected_type_inferred = "float"
818
+ # when it is classifier, infer the datatype from label columns
819
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
820
+ expected_type_inferred = convert_sp_to_sf_type(
821
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
822
+ )
823
+
793
824
  output_df = self._batch_inference(
794
825
  dataset=dataset,
795
826
  inference_method="predict",
796
827
  expected_output_cols_list=self.output_cols,
797
- expected_output_cols_type="float",
828
+ expected_output_cols_type=expected_type_inferred,
798
829
  )
799
830
  elif isinstance(dataset, pd.DataFrame):
800
831
  output_df = self._sklearn_inference(
@@ -865,10 +896,10 @@ class RadiusNeighborsRegressor(BaseTransformer):
865
896
 
866
897
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
867
898
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
868
- Returns an empty list if current object is not a classifier or not yet fitted.
899
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
869
900
  """
870
901
  if getattr(self._sklearn_object, "classes_", None) is None:
871
- return []
902
+ return [output_cols_prefix]
872
903
 
873
904
  classes = self._sklearn_object.classes_
874
905
  if isinstance(classes, numpy.ndarray):
@@ -1093,7 +1124,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
1093
1124
  cp.dump(self._sklearn_object, local_score_file)
1094
1125
 
1095
1126
  # Create temp stage to run score.
1096
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1127
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1097
1128
  session = dataset._session
1098
1129
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1099
1130
  SqlResultValidator(
@@ -1107,8 +1138,9 @@ class RadiusNeighborsRegressor(BaseTransformer):
1107
1138
  expected_value=f"Stage area {score_stage_name} successfully created."
1108
1139
  ).validate()
1109
1140
 
1110
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1111
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1141
+ # Use posixpath to construct stage paths
1142
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1143
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1112
1144
  statement_params = telemetry.get_function_usage_statement_params(
1113
1145
  project=_PROJECT,
1114
1146
  subproject=_SUBPROJECT,
@@ -1134,6 +1166,7 @@ class RadiusNeighborsRegressor(BaseTransformer):
1134
1166
  replace=True,
1135
1167
  session=session,
1136
1168
  statement_params=statement_params,
1169
+ anonymous=True
1137
1170
  )
1138
1171
  def score_wrapper_sproc(
1139
1172
  session: Session,
@@ -1141,7 +1174,8 @@ class RadiusNeighborsRegressor(BaseTransformer):
1141
1174
  stage_score_file_name: str,
1142
1175
  input_cols: List[str],
1143
1176
  label_cols: List[str],
1144
- sample_weight_col: Optional[str]
1177
+ sample_weight_col: Optional[str],
1178
+ statement_params: Dict[str, str]
1145
1179
  ) -> float:
1146
1180
  import cloudpickle as cp
1147
1181
  import numpy as np
@@ -1191,14 +1225,14 @@ class RadiusNeighborsRegressor(BaseTransformer):
1191
1225
  api_calls=[Session.call],
1192
1226
  custom_tags=dict([("autogen", True)]),
1193
1227
  )
1194
- score = session.call(
1195
- score_sproc_name,
1228
+ score = score_wrapper_sproc(
1229
+ session,
1196
1230
  query,
1197
1231
  stage_score_file_name,
1198
1232
  identifier.get_unescaped_names(self.input_cols),
1199
1233
  identifier.get_unescaped_names(self.label_cols),
1200
1234
  identifier.get_unescaped_names(self.sample_weight_col),
1201
- statement_params=statement_params,
1235
+ statement_params,
1202
1236
  )
1203
1237
 
1204
1238
  cleanup_temp_files([local_score_file_name])
@@ -1216,18 +1250,20 @@ class RadiusNeighborsRegressor(BaseTransformer):
1216
1250
  if self._sklearn_object._estimator_type == 'classifier':
1217
1251
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1218
1252
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1219
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1253
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1254
+ ([] if self._drop_input_cols else inputs) + outputs)
1220
1255
  # For regressor, the type of predict is float64
1221
1256
  elif self._sklearn_object._estimator_type == 'regressor':
1222
1257
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1223
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1224
-
1258
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1259
+ ([] if self._drop_input_cols else inputs) + outputs)
1225
1260
  for prob_func in PROB_FUNCTIONS:
1226
1261
  if hasattr(self, prob_func):
1227
1262
  output_cols_prefix: str = f"{prob_func}_"
1228
1263
  output_column_names = self._get_output_column_names(output_cols_prefix)
1229
1264
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1230
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1265
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1266
+ ([] if self._drop_input_cols else inputs) + outputs)
1231
1267
 
1232
1268
  @property
1233
1269
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -216,7 +218,6 @@ class BernoulliRBM(BaseTransformer):
216
218
  sample_weight_col: Optional[str] = None,
217
219
  ) -> None:
218
220
  super().__init__()
219
- self.id = str(uuid4()).replace("-", "_").upper()
220
221
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
221
222
 
222
223
  self._deps = list(deps)
@@ -241,6 +242,15 @@ class BernoulliRBM(BaseTransformer):
241
242
  self.set_drop_input_cols(drop_input_cols)
242
243
  self.set_sample_weight_col(sample_weight_col)
243
244
 
245
+ def _get_rand_id(self) -> str:
246
+ """
247
+ Generate random id to be used in sproc and stage names.
248
+
249
+ Returns:
250
+ Random id string usable in sproc, table, and stage names.
251
+ """
252
+ return str(uuid4()).replace("-", "_").upper()
253
+
244
254
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
245
255
  """
246
256
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -319,7 +329,7 @@ class BernoulliRBM(BaseTransformer):
319
329
  cp.dump(self._sklearn_object, local_transform_file)
320
330
 
321
331
  # Create temp stage to run fit.
322
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
332
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
323
333
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
324
334
  SqlResultValidator(
325
335
  session=session,
@@ -332,11 +342,12 @@ class BernoulliRBM(BaseTransformer):
332
342
  expected_value=f"Stage area {transform_stage_name} successfully created."
333
343
  ).validate()
334
344
 
335
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
345
+ # Use posixpath to construct stage paths
346
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
347
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
336
348
  local_result_file_name = get_temp_file_path()
337
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
338
349
 
339
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
350
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
340
351
  statement_params = telemetry.get_function_usage_statement_params(
341
352
  project=_PROJECT,
342
353
  subproject=_SUBPROJECT,
@@ -362,6 +373,7 @@ class BernoulliRBM(BaseTransformer):
362
373
  replace=True,
363
374
  session=session,
364
375
  statement_params=statement_params,
376
+ anonymous=True
365
377
  )
366
378
  def fit_wrapper_sproc(
367
379
  session: Session,
@@ -370,7 +382,8 @@ class BernoulliRBM(BaseTransformer):
370
382
  stage_result_file_name: str,
371
383
  input_cols: List[str],
372
384
  label_cols: List[str],
373
- sample_weight_col: Optional[str]
385
+ sample_weight_col: Optional[str],
386
+ statement_params: Dict[str, str]
374
387
  ) -> str:
375
388
  import cloudpickle as cp
376
389
  import numpy as np
@@ -437,15 +450,15 @@ class BernoulliRBM(BaseTransformer):
437
450
  api_calls=[Session.call],
438
451
  custom_tags=dict([("autogen", True)]),
439
452
  )
440
- sproc_export_file_name = session.call(
441
- fit_sproc_name,
453
+ sproc_export_file_name = fit_wrapper_sproc(
454
+ session,
442
455
  query,
443
456
  stage_transform_file_name,
444
457
  stage_result_file_name,
445
458
  identifier.get_unescaped_names(self.input_cols),
446
459
  identifier.get_unescaped_names(self.label_cols),
447
460
  identifier.get_unescaped_names(self.sample_weight_col),
448
- statement_params=statement_params,
461
+ statement_params,
449
462
  )
450
463
 
451
464
  if "|" in sproc_export_file_name:
@@ -455,7 +468,7 @@ class BernoulliRBM(BaseTransformer):
455
468
  print("\n".join(fields[1:]))
456
469
 
457
470
  session.file.get(
458
- os.path.join(stage_result_file_name, sproc_export_file_name),
471
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
459
472
  local_result_file_name,
460
473
  statement_params=statement_params
461
474
  )
@@ -501,7 +514,7 @@ class BernoulliRBM(BaseTransformer):
501
514
 
502
515
  # Register vectorized UDF for batch inference
503
516
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
504
- safe_id=self.id, method=inference_method)
517
+ safe_id=self._get_rand_id(), method=inference_method)
505
518
 
506
519
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
507
520
  # will try to pickle all of self which fails.
@@ -593,7 +606,7 @@ class BernoulliRBM(BaseTransformer):
593
606
  return transformed_pandas_df.to_dict("records")
594
607
 
595
608
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
596
- safe_id=self.id
609
+ safe_id=self._get_rand_id()
597
610
  )
598
611
 
599
612
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -649,26 +662,37 @@ class BernoulliRBM(BaseTransformer):
649
662
  # input cols need to match unquoted / quoted
650
663
  input_cols = self.input_cols
651
664
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
665
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
652
666
 
653
667
  estimator = self._sklearn_object
654
668
 
655
- input_df = dataset[input_cols] # Select input columns with quoted column names.
656
- if hasattr(estimator, "feature_names_in_"):
657
- missing_features = []
658
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
659
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
660
- missing_features.append(f)
661
-
662
- if len(missing_features) > 0:
663
- raise ValueError(
664
- "The feature names should match with those that were passed during fit.\n"
665
- f"Features seen during fit call but not present in the input: {missing_features}\n"
666
- f"Features in the input dataframe : {input_cols}\n"
667
- )
668
- input_df.columns = getattr(estimator, "feature_names_in_")
669
- else:
670
- # Just rename the column names to unquoted identifiers.
671
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
669
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
670
+ missing_features = []
671
+ features_in_dataset = set(dataset.columns)
672
+ columns_to_select = []
673
+ for i, f in enumerate(features_required_by_estimator):
674
+ if (
675
+ i >= len(input_cols)
676
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
677
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
678
+ and quoted_input_cols[i] not in features_in_dataset)
679
+ ):
680
+ missing_features.append(f)
681
+ elif input_cols[i] in features_in_dataset:
682
+ columns_to_select.append(input_cols[i])
683
+ elif unquoted_input_cols[i] in features_in_dataset:
684
+ columns_to_select.append(unquoted_input_cols[i])
685
+ else:
686
+ columns_to_select.append(quoted_input_cols[i])
687
+
688
+ if len(missing_features) > 0:
689
+ raise ValueError(
690
+ "The feature names should match with those that were passed during fit.\n"
691
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
692
+ f"Features in the input dataframe : {input_cols}\n"
693
+ )
694
+ input_df = dataset[columns_to_select]
695
+ input_df.columns = features_required_by_estimator
672
696
 
673
697
  transformed_numpy_array = getattr(estimator, inference_method)(
674
698
  input_df
@@ -747,11 +771,18 @@ class BernoulliRBM(BaseTransformer):
747
771
  Transformed dataset.
748
772
  """
749
773
  if isinstance(dataset, DataFrame):
774
+ expected_type_inferred = ""
775
+ # when it is classifier, infer the datatype from label columns
776
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
777
+ expected_type_inferred = convert_sp_to_sf_type(
778
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
779
+ )
780
+
750
781
  output_df = self._batch_inference(
751
782
  dataset=dataset,
752
783
  inference_method="predict",
753
784
  expected_output_cols_list=self.output_cols,
754
- expected_output_cols_type="",
785
+ expected_output_cols_type=expected_type_inferred,
755
786
  )
756
787
  elif isinstance(dataset, pd.DataFrame):
757
788
  output_df = self._sklearn_inference(
@@ -824,10 +855,10 @@ class BernoulliRBM(BaseTransformer):
824
855
 
825
856
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
826
857
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
827
- Returns an empty list if current object is not a classifier or not yet fitted.
858
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
828
859
  """
829
860
  if getattr(self._sklearn_object, "classes_", None) is None:
830
- return []
861
+ return [output_cols_prefix]
831
862
 
832
863
  classes = self._sklearn_object.classes_
833
864
  if isinstance(classes, numpy.ndarray):
@@ -1052,7 +1083,7 @@ class BernoulliRBM(BaseTransformer):
1052
1083
  cp.dump(self._sklearn_object, local_score_file)
1053
1084
 
1054
1085
  # Create temp stage to run score.
1055
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1086
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1056
1087
  session = dataset._session
1057
1088
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1058
1089
  SqlResultValidator(
@@ -1066,8 +1097,9 @@ class BernoulliRBM(BaseTransformer):
1066
1097
  expected_value=f"Stage area {score_stage_name} successfully created."
1067
1098
  ).validate()
1068
1099
 
1069
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1070
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1100
+ # Use posixpath to construct stage paths
1101
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1102
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1071
1103
  statement_params = telemetry.get_function_usage_statement_params(
1072
1104
  project=_PROJECT,
1073
1105
  subproject=_SUBPROJECT,
@@ -1093,6 +1125,7 @@ class BernoulliRBM(BaseTransformer):
1093
1125
  replace=True,
1094
1126
  session=session,
1095
1127
  statement_params=statement_params,
1128
+ anonymous=True
1096
1129
  )
1097
1130
  def score_wrapper_sproc(
1098
1131
  session: Session,
@@ -1100,7 +1133,8 @@ class BernoulliRBM(BaseTransformer):
1100
1133
  stage_score_file_name: str,
1101
1134
  input_cols: List[str],
1102
1135
  label_cols: List[str],
1103
- sample_weight_col: Optional[str]
1136
+ sample_weight_col: Optional[str],
1137
+ statement_params: Dict[str, str]
1104
1138
  ) -> float:
1105
1139
  import cloudpickle as cp
1106
1140
  import numpy as np
@@ -1150,14 +1184,14 @@ class BernoulliRBM(BaseTransformer):
1150
1184
  api_calls=[Session.call],
1151
1185
  custom_tags=dict([("autogen", True)]),
1152
1186
  )
1153
- score = session.call(
1154
- score_sproc_name,
1187
+ score = score_wrapper_sproc(
1188
+ session,
1155
1189
  query,
1156
1190
  stage_score_file_name,
1157
1191
  identifier.get_unescaped_names(self.input_cols),
1158
1192
  identifier.get_unescaped_names(self.label_cols),
1159
1193
  identifier.get_unescaped_names(self.sample_weight_col),
1160
- statement_params=statement_params,
1194
+ statement_params,
1161
1195
  )
1162
1196
 
1163
1197
  cleanup_temp_files([local_score_file_name])
@@ -1175,18 +1209,20 @@ class BernoulliRBM(BaseTransformer):
1175
1209
  if self._sklearn_object._estimator_type == 'classifier':
1176
1210
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1177
1211
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1178
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1212
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1213
+ ([] if self._drop_input_cols else inputs) + outputs)
1179
1214
  # For regressor, the type of predict is float64
1180
1215
  elif self._sklearn_object._estimator_type == 'regressor':
1181
1216
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1182
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1183
-
1217
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1218
+ ([] if self._drop_input_cols else inputs) + outputs)
1184
1219
  for prob_func in PROB_FUNCTIONS:
1185
1220
  if hasattr(self, prob_func):
1186
1221
  output_cols_prefix: str = f"{prob_func}_"
1187
1222
  output_column_names = self._get_output_column_names(output_cols_prefix)
1188
1223
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1189
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1224
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1225
+ ([] if self._drop_input_cols else inputs) + outputs)
1190
1226
 
1191
1227
  @property
1192
1228
  def model_signatures(self) -> Dict[str, ModelSignature]: