snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -230,7 +232,6 @@ class ARDRegression(BaseTransformer):
230
232
  sample_weight_col: Optional[str] = None,
231
233
  ) -> None:
232
234
  super().__init__()
233
- self.id = str(uuid4()).replace("-", "_").upper()
234
235
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
235
236
 
236
237
  self._deps = list(deps)
@@ -260,6 +261,15 @@ class ARDRegression(BaseTransformer):
260
261
  self.set_drop_input_cols(drop_input_cols)
261
262
  self.set_sample_weight_col(sample_weight_col)
262
263
 
264
+ def _get_rand_id(self) -> str:
265
+ """
266
+ Generate random id to be used in sproc and stage names.
267
+
268
+ Returns:
269
+ Random id string usable in sproc, table, and stage names.
270
+ """
271
+ return str(uuid4()).replace("-", "_").upper()
272
+
263
273
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
264
274
  """
265
275
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -338,7 +348,7 @@ class ARDRegression(BaseTransformer):
338
348
  cp.dump(self._sklearn_object, local_transform_file)
339
349
 
340
350
  # Create temp stage to run fit.
341
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
351
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
342
352
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
343
353
  SqlResultValidator(
344
354
  session=session,
@@ -351,11 +361,12 @@ class ARDRegression(BaseTransformer):
351
361
  expected_value=f"Stage area {transform_stage_name} successfully created."
352
362
  ).validate()
353
363
 
354
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
364
+ # Use posixpath to construct stage paths
365
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
366
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
355
367
  local_result_file_name = get_temp_file_path()
356
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
357
368
 
358
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
369
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
359
370
  statement_params = telemetry.get_function_usage_statement_params(
360
371
  project=_PROJECT,
361
372
  subproject=_SUBPROJECT,
@@ -381,6 +392,7 @@ class ARDRegression(BaseTransformer):
381
392
  replace=True,
382
393
  session=session,
383
394
  statement_params=statement_params,
395
+ anonymous=True
384
396
  )
385
397
  def fit_wrapper_sproc(
386
398
  session: Session,
@@ -389,7 +401,8 @@ class ARDRegression(BaseTransformer):
389
401
  stage_result_file_name: str,
390
402
  input_cols: List[str],
391
403
  label_cols: List[str],
392
- sample_weight_col: Optional[str]
404
+ sample_weight_col: Optional[str],
405
+ statement_params: Dict[str, str]
393
406
  ) -> str:
394
407
  import cloudpickle as cp
395
408
  import numpy as np
@@ -456,15 +469,15 @@ class ARDRegression(BaseTransformer):
456
469
  api_calls=[Session.call],
457
470
  custom_tags=dict([("autogen", True)]),
458
471
  )
459
- sproc_export_file_name = session.call(
460
- fit_sproc_name,
472
+ sproc_export_file_name = fit_wrapper_sproc(
473
+ session,
461
474
  query,
462
475
  stage_transform_file_name,
463
476
  stage_result_file_name,
464
477
  identifier.get_unescaped_names(self.input_cols),
465
478
  identifier.get_unescaped_names(self.label_cols),
466
479
  identifier.get_unescaped_names(self.sample_weight_col),
467
- statement_params=statement_params,
480
+ statement_params,
468
481
  )
469
482
 
470
483
  if "|" in sproc_export_file_name:
@@ -474,7 +487,7 @@ class ARDRegression(BaseTransformer):
474
487
  print("\n".join(fields[1:]))
475
488
 
476
489
  session.file.get(
477
- os.path.join(stage_result_file_name, sproc_export_file_name),
490
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
478
491
  local_result_file_name,
479
492
  statement_params=statement_params
480
493
  )
@@ -520,7 +533,7 @@ class ARDRegression(BaseTransformer):
520
533
 
521
534
  # Register vectorized UDF for batch inference
522
535
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
523
- safe_id=self.id, method=inference_method)
536
+ safe_id=self._get_rand_id(), method=inference_method)
524
537
 
525
538
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
526
539
  # will try to pickle all of self which fails.
@@ -612,7 +625,7 @@ class ARDRegression(BaseTransformer):
612
625
  return transformed_pandas_df.to_dict("records")
613
626
 
614
627
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
615
- safe_id=self.id
628
+ safe_id=self._get_rand_id()
616
629
  )
617
630
 
618
631
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -668,26 +681,37 @@ class ARDRegression(BaseTransformer):
668
681
  # input cols need to match unquoted / quoted
669
682
  input_cols = self.input_cols
670
683
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
684
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
671
685
 
672
686
  estimator = self._sklearn_object
673
687
 
674
- input_df = dataset[input_cols] # Select input columns with quoted column names.
675
- if hasattr(estimator, "feature_names_in_"):
676
- missing_features = []
677
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
678
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
679
- missing_features.append(f)
680
-
681
- if len(missing_features) > 0:
682
- raise ValueError(
683
- "The feature names should match with those that were passed during fit.\n"
684
- f"Features seen during fit call but not present in the input: {missing_features}\n"
685
- f"Features in the input dataframe : {input_cols}\n"
686
- )
687
- input_df.columns = getattr(estimator, "feature_names_in_")
688
- else:
689
- # Just rename the column names to unquoted identifiers.
690
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
688
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
689
+ missing_features = []
690
+ features_in_dataset = set(dataset.columns)
691
+ columns_to_select = []
692
+ for i, f in enumerate(features_required_by_estimator):
693
+ if (
694
+ i >= len(input_cols)
695
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
696
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
697
+ and quoted_input_cols[i] not in features_in_dataset)
698
+ ):
699
+ missing_features.append(f)
700
+ elif input_cols[i] in features_in_dataset:
701
+ columns_to_select.append(input_cols[i])
702
+ elif unquoted_input_cols[i] in features_in_dataset:
703
+ columns_to_select.append(unquoted_input_cols[i])
704
+ else:
705
+ columns_to_select.append(quoted_input_cols[i])
706
+
707
+ if len(missing_features) > 0:
708
+ raise ValueError(
709
+ "The feature names should match with those that were passed during fit.\n"
710
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
711
+ f"Features in the input dataframe : {input_cols}\n"
712
+ )
713
+ input_df = dataset[columns_to_select]
714
+ input_df.columns = features_required_by_estimator
691
715
 
692
716
  transformed_numpy_array = getattr(estimator, inference_method)(
693
717
  input_df
@@ -768,11 +792,18 @@ class ARDRegression(BaseTransformer):
768
792
  Transformed dataset.
769
793
  """
770
794
  if isinstance(dataset, DataFrame):
795
+ expected_type_inferred = "float"
796
+ # when it is classifier, infer the datatype from label columns
797
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
798
+ expected_type_inferred = convert_sp_to_sf_type(
799
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
800
+ )
801
+
771
802
  output_df = self._batch_inference(
772
803
  dataset=dataset,
773
804
  inference_method="predict",
774
805
  expected_output_cols_list=self.output_cols,
775
- expected_output_cols_type="float",
806
+ expected_output_cols_type=expected_type_inferred,
776
807
  )
777
808
  elif isinstance(dataset, pd.DataFrame):
778
809
  output_df = self._sklearn_inference(
@@ -843,10 +874,10 @@ class ARDRegression(BaseTransformer):
843
874
 
844
875
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
845
876
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
846
- Returns an empty list if current object is not a classifier or not yet fitted.
877
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
847
878
  """
848
879
  if getattr(self._sklearn_object, "classes_", None) is None:
849
- return []
880
+ return [output_cols_prefix]
850
881
 
851
882
  classes = self._sklearn_object.classes_
852
883
  if isinstance(classes, numpy.ndarray):
@@ -1071,7 +1102,7 @@ class ARDRegression(BaseTransformer):
1071
1102
  cp.dump(self._sklearn_object, local_score_file)
1072
1103
 
1073
1104
  # Create temp stage to run score.
1074
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1105
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1075
1106
  session = dataset._session
1076
1107
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1077
1108
  SqlResultValidator(
@@ -1085,8 +1116,9 @@ class ARDRegression(BaseTransformer):
1085
1116
  expected_value=f"Stage area {score_stage_name} successfully created."
1086
1117
  ).validate()
1087
1118
 
1088
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1089
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1119
+ # Use posixpath to construct stage paths
1120
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1121
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1090
1122
  statement_params = telemetry.get_function_usage_statement_params(
1091
1123
  project=_PROJECT,
1092
1124
  subproject=_SUBPROJECT,
@@ -1112,6 +1144,7 @@ class ARDRegression(BaseTransformer):
1112
1144
  replace=True,
1113
1145
  session=session,
1114
1146
  statement_params=statement_params,
1147
+ anonymous=True
1115
1148
  )
1116
1149
  def score_wrapper_sproc(
1117
1150
  session: Session,
@@ -1119,7 +1152,8 @@ class ARDRegression(BaseTransformer):
1119
1152
  stage_score_file_name: str,
1120
1153
  input_cols: List[str],
1121
1154
  label_cols: List[str],
1122
- sample_weight_col: Optional[str]
1155
+ sample_weight_col: Optional[str],
1156
+ statement_params: Dict[str, str]
1123
1157
  ) -> float:
1124
1158
  import cloudpickle as cp
1125
1159
  import numpy as np
@@ -1169,14 +1203,14 @@ class ARDRegression(BaseTransformer):
1169
1203
  api_calls=[Session.call],
1170
1204
  custom_tags=dict([("autogen", True)]),
1171
1205
  )
1172
- score = session.call(
1173
- score_sproc_name,
1206
+ score = score_wrapper_sproc(
1207
+ session,
1174
1208
  query,
1175
1209
  stage_score_file_name,
1176
1210
  identifier.get_unescaped_names(self.input_cols),
1177
1211
  identifier.get_unescaped_names(self.label_cols),
1178
1212
  identifier.get_unescaped_names(self.sample_weight_col),
1179
- statement_params=statement_params,
1213
+ statement_params,
1180
1214
  )
1181
1215
 
1182
1216
  cleanup_temp_files([local_score_file_name])
@@ -1194,18 +1228,20 @@ class ARDRegression(BaseTransformer):
1194
1228
  if self._sklearn_object._estimator_type == 'classifier':
1195
1229
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1196
1230
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1197
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1231
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1232
+ ([] if self._drop_input_cols else inputs) + outputs)
1198
1233
  # For regressor, the type of predict is float64
1199
1234
  elif self._sklearn_object._estimator_type == 'regressor':
1200
1235
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1201
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1202
-
1236
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1237
+ ([] if self._drop_input_cols else inputs) + outputs)
1203
1238
  for prob_func in PROB_FUNCTIONS:
1204
1239
  if hasattr(self, prob_func):
1205
1240
  output_cols_prefix: str = f"{prob_func}_"
1206
1241
  output_column_names = self._get_output_column_names(output_cols_prefix)
1207
1242
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1208
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1243
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1244
+ ([] if self._drop_input_cols else inputs) + outputs)
1209
1245
 
1210
1246
  @property
1211
1247
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -238,7 +240,6 @@ class BayesianRidge(BaseTransformer):
238
240
  sample_weight_col: Optional[str] = None,
239
241
  ) -> None:
240
242
  super().__init__()
241
- self.id = str(uuid4()).replace("-", "_").upper()
242
243
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
243
244
 
244
245
  self._deps = list(deps)
@@ -269,6 +270,15 @@ class BayesianRidge(BaseTransformer):
269
270
  self.set_drop_input_cols(drop_input_cols)
270
271
  self.set_sample_weight_col(sample_weight_col)
271
272
 
273
+ def _get_rand_id(self) -> str:
274
+ """
275
+ Generate random id to be used in sproc and stage names.
276
+
277
+ Returns:
278
+ Random id string usable in sproc, table, and stage names.
279
+ """
280
+ return str(uuid4()).replace("-", "_").upper()
281
+
272
282
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
273
283
  """
274
284
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -347,7 +357,7 @@ class BayesianRidge(BaseTransformer):
347
357
  cp.dump(self._sklearn_object, local_transform_file)
348
358
 
349
359
  # Create temp stage to run fit.
350
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
360
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
351
361
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
352
362
  SqlResultValidator(
353
363
  session=session,
@@ -360,11 +370,12 @@ class BayesianRidge(BaseTransformer):
360
370
  expected_value=f"Stage area {transform_stage_name} successfully created."
361
371
  ).validate()
362
372
 
363
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
373
+ # Use posixpath to construct stage paths
374
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
375
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
364
376
  local_result_file_name = get_temp_file_path()
365
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
366
377
 
367
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
378
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
368
379
  statement_params = telemetry.get_function_usage_statement_params(
369
380
  project=_PROJECT,
370
381
  subproject=_SUBPROJECT,
@@ -390,6 +401,7 @@ class BayesianRidge(BaseTransformer):
390
401
  replace=True,
391
402
  session=session,
392
403
  statement_params=statement_params,
404
+ anonymous=True
393
405
  )
394
406
  def fit_wrapper_sproc(
395
407
  session: Session,
@@ -398,7 +410,8 @@ class BayesianRidge(BaseTransformer):
398
410
  stage_result_file_name: str,
399
411
  input_cols: List[str],
400
412
  label_cols: List[str],
401
- sample_weight_col: Optional[str]
413
+ sample_weight_col: Optional[str],
414
+ statement_params: Dict[str, str]
402
415
  ) -> str:
403
416
  import cloudpickle as cp
404
417
  import numpy as np
@@ -465,15 +478,15 @@ class BayesianRidge(BaseTransformer):
465
478
  api_calls=[Session.call],
466
479
  custom_tags=dict([("autogen", True)]),
467
480
  )
468
- sproc_export_file_name = session.call(
469
- fit_sproc_name,
481
+ sproc_export_file_name = fit_wrapper_sproc(
482
+ session,
470
483
  query,
471
484
  stage_transform_file_name,
472
485
  stage_result_file_name,
473
486
  identifier.get_unescaped_names(self.input_cols),
474
487
  identifier.get_unescaped_names(self.label_cols),
475
488
  identifier.get_unescaped_names(self.sample_weight_col),
476
- statement_params=statement_params,
489
+ statement_params,
477
490
  )
478
491
 
479
492
  if "|" in sproc_export_file_name:
@@ -483,7 +496,7 @@ class BayesianRidge(BaseTransformer):
483
496
  print("\n".join(fields[1:]))
484
497
 
485
498
  session.file.get(
486
- os.path.join(stage_result_file_name, sproc_export_file_name),
499
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
487
500
  local_result_file_name,
488
501
  statement_params=statement_params
489
502
  )
@@ -529,7 +542,7 @@ class BayesianRidge(BaseTransformer):
529
542
 
530
543
  # Register vectorized UDF for batch inference
531
544
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
532
- safe_id=self.id, method=inference_method)
545
+ safe_id=self._get_rand_id(), method=inference_method)
533
546
 
534
547
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
535
548
  # will try to pickle all of self which fails.
@@ -621,7 +634,7 @@ class BayesianRidge(BaseTransformer):
621
634
  return transformed_pandas_df.to_dict("records")
622
635
 
623
636
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
624
- safe_id=self.id
637
+ safe_id=self._get_rand_id()
625
638
  )
626
639
 
627
640
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -677,26 +690,37 @@ class BayesianRidge(BaseTransformer):
677
690
  # input cols need to match unquoted / quoted
678
691
  input_cols = self.input_cols
679
692
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
693
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
680
694
 
681
695
  estimator = self._sklearn_object
682
696
 
683
- input_df = dataset[input_cols] # Select input columns with quoted column names.
684
- if hasattr(estimator, "feature_names_in_"):
685
- missing_features = []
686
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
687
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
688
- missing_features.append(f)
689
-
690
- if len(missing_features) > 0:
691
- raise ValueError(
692
- "The feature names should match with those that were passed during fit.\n"
693
- f"Features seen during fit call but not present in the input: {missing_features}\n"
694
- f"Features in the input dataframe : {input_cols}\n"
695
- )
696
- input_df.columns = getattr(estimator, "feature_names_in_")
697
- else:
698
- # Just rename the column names to unquoted identifiers.
699
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
697
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
698
+ missing_features = []
699
+ features_in_dataset = set(dataset.columns)
700
+ columns_to_select = []
701
+ for i, f in enumerate(features_required_by_estimator):
702
+ if (
703
+ i >= len(input_cols)
704
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
705
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
706
+ and quoted_input_cols[i] not in features_in_dataset)
707
+ ):
708
+ missing_features.append(f)
709
+ elif input_cols[i] in features_in_dataset:
710
+ columns_to_select.append(input_cols[i])
711
+ elif unquoted_input_cols[i] in features_in_dataset:
712
+ columns_to_select.append(unquoted_input_cols[i])
713
+ else:
714
+ columns_to_select.append(quoted_input_cols[i])
715
+
716
+ if len(missing_features) > 0:
717
+ raise ValueError(
718
+ "The feature names should match with those that were passed during fit.\n"
719
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
720
+ f"Features in the input dataframe : {input_cols}\n"
721
+ )
722
+ input_df = dataset[columns_to_select]
723
+ input_df.columns = features_required_by_estimator
700
724
 
701
725
  transformed_numpy_array = getattr(estimator, inference_method)(
702
726
  input_df
@@ -777,11 +801,18 @@ class BayesianRidge(BaseTransformer):
777
801
  Transformed dataset.
778
802
  """
779
803
  if isinstance(dataset, DataFrame):
804
+ expected_type_inferred = "float"
805
+ # when it is classifier, infer the datatype from label columns
806
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
807
+ expected_type_inferred = convert_sp_to_sf_type(
808
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
809
+ )
810
+
780
811
  output_df = self._batch_inference(
781
812
  dataset=dataset,
782
813
  inference_method="predict",
783
814
  expected_output_cols_list=self.output_cols,
784
- expected_output_cols_type="float",
815
+ expected_output_cols_type=expected_type_inferred,
785
816
  )
786
817
  elif isinstance(dataset, pd.DataFrame):
787
818
  output_df = self._sklearn_inference(
@@ -852,10 +883,10 @@ class BayesianRidge(BaseTransformer):
852
883
 
853
884
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
854
885
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
855
- Returns an empty list if current object is not a classifier or not yet fitted.
886
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
856
887
  """
857
888
  if getattr(self._sklearn_object, "classes_", None) is None:
858
- return []
889
+ return [output_cols_prefix]
859
890
 
860
891
  classes = self._sklearn_object.classes_
861
892
  if isinstance(classes, numpy.ndarray):
@@ -1080,7 +1111,7 @@ class BayesianRidge(BaseTransformer):
1080
1111
  cp.dump(self._sklearn_object, local_score_file)
1081
1112
 
1082
1113
  # Create temp stage to run score.
1083
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1114
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1084
1115
  session = dataset._session
1085
1116
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1086
1117
  SqlResultValidator(
@@ -1094,8 +1125,9 @@ class BayesianRidge(BaseTransformer):
1094
1125
  expected_value=f"Stage area {score_stage_name} successfully created."
1095
1126
  ).validate()
1096
1127
 
1097
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1098
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1128
+ # Use posixpath to construct stage paths
1129
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1130
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1099
1131
  statement_params = telemetry.get_function_usage_statement_params(
1100
1132
  project=_PROJECT,
1101
1133
  subproject=_SUBPROJECT,
@@ -1121,6 +1153,7 @@ class BayesianRidge(BaseTransformer):
1121
1153
  replace=True,
1122
1154
  session=session,
1123
1155
  statement_params=statement_params,
1156
+ anonymous=True
1124
1157
  )
1125
1158
  def score_wrapper_sproc(
1126
1159
  session: Session,
@@ -1128,7 +1161,8 @@ class BayesianRidge(BaseTransformer):
1128
1161
  stage_score_file_name: str,
1129
1162
  input_cols: List[str],
1130
1163
  label_cols: List[str],
1131
- sample_weight_col: Optional[str]
1164
+ sample_weight_col: Optional[str],
1165
+ statement_params: Dict[str, str]
1132
1166
  ) -> float:
1133
1167
  import cloudpickle as cp
1134
1168
  import numpy as np
@@ -1178,14 +1212,14 @@ class BayesianRidge(BaseTransformer):
1178
1212
  api_calls=[Session.call],
1179
1213
  custom_tags=dict([("autogen", True)]),
1180
1214
  )
1181
- score = session.call(
1182
- score_sproc_name,
1215
+ score = score_wrapper_sproc(
1216
+ session,
1183
1217
  query,
1184
1218
  stage_score_file_name,
1185
1219
  identifier.get_unescaped_names(self.input_cols),
1186
1220
  identifier.get_unescaped_names(self.label_cols),
1187
1221
  identifier.get_unescaped_names(self.sample_weight_col),
1188
- statement_params=statement_params,
1222
+ statement_params,
1189
1223
  )
1190
1224
 
1191
1225
  cleanup_temp_files([local_score_file_name])
@@ -1203,18 +1237,20 @@ class BayesianRidge(BaseTransformer):
1203
1237
  if self._sklearn_object._estimator_type == 'classifier':
1204
1238
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1205
1239
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1206
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1240
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1241
+ ([] if self._drop_input_cols else inputs) + outputs)
1207
1242
  # For regressor, the type of predict is float64
1208
1243
  elif self._sklearn_object._estimator_type == 'regressor':
1209
1244
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1210
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1211
-
1245
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1246
+ ([] if self._drop_input_cols else inputs) + outputs)
1212
1247
  for prob_func in PROB_FUNCTIONS:
1213
1248
  if hasattr(self, prob_func):
1214
1249
  output_cols_prefix: str = f"{prob_func}_"
1215
1250
  output_column_names = self._get_output_column_names(output_cols_prefix)
1216
1251
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1217
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1252
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1253
+ ([] if self._drop_input_cols else inputs) + outputs)
1218
1254
 
1219
1255
  @property
1220
1256
  def model_signatures(self) -> Dict[str, ModelSignature]: