snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -241,7 +243,6 @@ class SpectralBiclustering(BaseTransformer):
241
243
  sample_weight_col: Optional[str] = None,
242
244
  ) -> None:
243
245
  super().__init__()
244
- self.id = str(uuid4()).replace("-", "_").upper()
245
246
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
246
247
 
247
248
  self._deps = list(deps)
@@ -270,6 +271,15 @@ class SpectralBiclustering(BaseTransformer):
270
271
  self.set_drop_input_cols(drop_input_cols)
271
272
  self.set_sample_weight_col(sample_weight_col)
272
273
 
274
+ def _get_rand_id(self) -> str:
275
+ """
276
+ Generate random id to be used in sproc and stage names.
277
+
278
+ Returns:
279
+ Random id string usable in sproc, table, and stage names.
280
+ """
281
+ return str(uuid4()).replace("-", "_").upper()
282
+
273
283
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
274
284
  """
275
285
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -348,7 +358,7 @@ class SpectralBiclustering(BaseTransformer):
348
358
  cp.dump(self._sklearn_object, local_transform_file)
349
359
 
350
360
  # Create temp stage to run fit.
351
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
361
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
352
362
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
353
363
  SqlResultValidator(
354
364
  session=session,
@@ -361,11 +371,12 @@ class SpectralBiclustering(BaseTransformer):
361
371
  expected_value=f"Stage area {transform_stage_name} successfully created."
362
372
  ).validate()
363
373
 
364
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
374
+ # Use posixpath to construct stage paths
375
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
376
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
365
377
  local_result_file_name = get_temp_file_path()
366
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
367
378
 
368
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
379
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
369
380
  statement_params = telemetry.get_function_usage_statement_params(
370
381
  project=_PROJECT,
371
382
  subproject=_SUBPROJECT,
@@ -391,6 +402,7 @@ class SpectralBiclustering(BaseTransformer):
391
402
  replace=True,
392
403
  session=session,
393
404
  statement_params=statement_params,
405
+ anonymous=True
394
406
  )
395
407
  def fit_wrapper_sproc(
396
408
  session: Session,
@@ -399,7 +411,8 @@ class SpectralBiclustering(BaseTransformer):
399
411
  stage_result_file_name: str,
400
412
  input_cols: List[str],
401
413
  label_cols: List[str],
402
- sample_weight_col: Optional[str]
414
+ sample_weight_col: Optional[str],
415
+ statement_params: Dict[str, str]
403
416
  ) -> str:
404
417
  import cloudpickle as cp
405
418
  import numpy as np
@@ -466,15 +479,15 @@ class SpectralBiclustering(BaseTransformer):
466
479
  api_calls=[Session.call],
467
480
  custom_tags=dict([("autogen", True)]),
468
481
  )
469
- sproc_export_file_name = session.call(
470
- fit_sproc_name,
482
+ sproc_export_file_name = fit_wrapper_sproc(
483
+ session,
471
484
  query,
472
485
  stage_transform_file_name,
473
486
  stage_result_file_name,
474
487
  identifier.get_unescaped_names(self.input_cols),
475
488
  identifier.get_unescaped_names(self.label_cols),
476
489
  identifier.get_unescaped_names(self.sample_weight_col),
477
- statement_params=statement_params,
490
+ statement_params,
478
491
  )
479
492
 
480
493
  if "|" in sproc_export_file_name:
@@ -484,7 +497,7 @@ class SpectralBiclustering(BaseTransformer):
484
497
  print("\n".join(fields[1:]))
485
498
 
486
499
  session.file.get(
487
- os.path.join(stage_result_file_name, sproc_export_file_name),
500
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
488
501
  local_result_file_name,
489
502
  statement_params=statement_params
490
503
  )
@@ -530,7 +543,7 @@ class SpectralBiclustering(BaseTransformer):
530
543
 
531
544
  # Register vectorized UDF for batch inference
532
545
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
533
- safe_id=self.id, method=inference_method)
546
+ safe_id=self._get_rand_id(), method=inference_method)
534
547
 
535
548
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
536
549
  # will try to pickle all of self which fails.
@@ -622,7 +635,7 @@ class SpectralBiclustering(BaseTransformer):
622
635
  return transformed_pandas_df.to_dict("records")
623
636
 
624
637
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
625
- safe_id=self.id
638
+ safe_id=self._get_rand_id()
626
639
  )
627
640
 
628
641
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -678,26 +691,37 @@ class SpectralBiclustering(BaseTransformer):
678
691
  # input cols need to match unquoted / quoted
679
692
  input_cols = self.input_cols
680
693
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
694
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
681
695
 
682
696
  estimator = self._sklearn_object
683
697
 
684
- input_df = dataset[input_cols] # Select input columns with quoted column names.
685
- if hasattr(estimator, "feature_names_in_"):
686
- missing_features = []
687
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
688
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
689
- missing_features.append(f)
690
-
691
- if len(missing_features) > 0:
692
- raise ValueError(
693
- "The feature names should match with those that were passed during fit.\n"
694
- f"Features seen during fit call but not present in the input: {missing_features}\n"
695
- f"Features in the input dataframe : {input_cols}\n"
696
- )
697
- input_df.columns = getattr(estimator, "feature_names_in_")
698
- else:
699
- # Just rename the column names to unquoted identifiers.
700
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
698
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
699
+ missing_features = []
700
+ features_in_dataset = set(dataset.columns)
701
+ columns_to_select = []
702
+ for i, f in enumerate(features_required_by_estimator):
703
+ if (
704
+ i >= len(input_cols)
705
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
706
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
707
+ and quoted_input_cols[i] not in features_in_dataset)
708
+ ):
709
+ missing_features.append(f)
710
+ elif input_cols[i] in features_in_dataset:
711
+ columns_to_select.append(input_cols[i])
712
+ elif unquoted_input_cols[i] in features_in_dataset:
713
+ columns_to_select.append(unquoted_input_cols[i])
714
+ else:
715
+ columns_to_select.append(quoted_input_cols[i])
716
+
717
+ if len(missing_features) > 0:
718
+ raise ValueError(
719
+ "The feature names should match with those that were passed during fit.\n"
720
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
721
+ f"Features in the input dataframe : {input_cols}\n"
722
+ )
723
+ input_df = dataset[columns_to_select]
724
+ input_df.columns = features_required_by_estimator
701
725
 
702
726
  transformed_numpy_array = getattr(estimator, inference_method)(
703
727
  input_df
@@ -776,11 +800,18 @@ class SpectralBiclustering(BaseTransformer):
776
800
  Transformed dataset.
777
801
  """
778
802
  if isinstance(dataset, DataFrame):
803
+ expected_type_inferred = ""
804
+ # when it is classifier, infer the datatype from label columns
805
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
806
+ expected_type_inferred = convert_sp_to_sf_type(
807
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
808
+ )
809
+
779
810
  output_df = self._batch_inference(
780
811
  dataset=dataset,
781
812
  inference_method="predict",
782
813
  expected_output_cols_list=self.output_cols,
783
- expected_output_cols_type="",
814
+ expected_output_cols_type=expected_type_inferred,
784
815
  )
785
816
  elif isinstance(dataset, pd.DataFrame):
786
817
  output_df = self._sklearn_inference(
@@ -851,10 +882,10 @@ class SpectralBiclustering(BaseTransformer):
851
882
 
852
883
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
853
884
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
854
- Returns an empty list if current object is not a classifier or not yet fitted.
885
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
855
886
  """
856
887
  if getattr(self._sklearn_object, "classes_", None) is None:
857
- return []
888
+ return [output_cols_prefix]
858
889
 
859
890
  classes = self._sklearn_object.classes_
860
891
  if isinstance(classes, numpy.ndarray):
@@ -1079,7 +1110,7 @@ class SpectralBiclustering(BaseTransformer):
1079
1110
  cp.dump(self._sklearn_object, local_score_file)
1080
1111
 
1081
1112
  # Create temp stage to run score.
1082
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1113
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1083
1114
  session = dataset._session
1084
1115
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1085
1116
  SqlResultValidator(
@@ -1093,8 +1124,9 @@ class SpectralBiclustering(BaseTransformer):
1093
1124
  expected_value=f"Stage area {score_stage_name} successfully created."
1094
1125
  ).validate()
1095
1126
 
1096
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1097
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1127
+ # Use posixpath to construct stage paths
1128
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1129
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1098
1130
  statement_params = telemetry.get_function_usage_statement_params(
1099
1131
  project=_PROJECT,
1100
1132
  subproject=_SUBPROJECT,
@@ -1120,6 +1152,7 @@ class SpectralBiclustering(BaseTransformer):
1120
1152
  replace=True,
1121
1153
  session=session,
1122
1154
  statement_params=statement_params,
1155
+ anonymous=True
1123
1156
  )
1124
1157
  def score_wrapper_sproc(
1125
1158
  session: Session,
@@ -1127,7 +1160,8 @@ class SpectralBiclustering(BaseTransformer):
1127
1160
  stage_score_file_name: str,
1128
1161
  input_cols: List[str],
1129
1162
  label_cols: List[str],
1130
- sample_weight_col: Optional[str]
1163
+ sample_weight_col: Optional[str],
1164
+ statement_params: Dict[str, str]
1131
1165
  ) -> float:
1132
1166
  import cloudpickle as cp
1133
1167
  import numpy as np
@@ -1177,14 +1211,14 @@ class SpectralBiclustering(BaseTransformer):
1177
1211
  api_calls=[Session.call],
1178
1212
  custom_tags=dict([("autogen", True)]),
1179
1213
  )
1180
- score = session.call(
1181
- score_sproc_name,
1214
+ score = score_wrapper_sproc(
1215
+ session,
1182
1216
  query,
1183
1217
  stage_score_file_name,
1184
1218
  identifier.get_unescaped_names(self.input_cols),
1185
1219
  identifier.get_unescaped_names(self.label_cols),
1186
1220
  identifier.get_unescaped_names(self.sample_weight_col),
1187
- statement_params=statement_params,
1221
+ statement_params,
1188
1222
  )
1189
1223
 
1190
1224
  cleanup_temp_files([local_score_file_name])
@@ -1202,18 +1236,20 @@ class SpectralBiclustering(BaseTransformer):
1202
1236
  if self._sklearn_object._estimator_type == 'classifier':
1203
1237
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1204
1238
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1205
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1239
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1240
+ ([] if self._drop_input_cols else inputs) + outputs)
1206
1241
  # For regressor, the type of predict is float64
1207
1242
  elif self._sklearn_object._estimator_type == 'regressor':
1208
1243
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1209
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1210
-
1244
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1245
+ ([] if self._drop_input_cols else inputs) + outputs)
1211
1246
  for prob_func in PROB_FUNCTIONS:
1212
1247
  if hasattr(self, prob_func):
1213
1248
  output_cols_prefix: str = f"{prob_func}_"
1214
1249
  output_column_names = self._get_output_column_names(output_cols_prefix)
1215
1250
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1216
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1251
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1252
+ ([] if self._drop_input_cols else inputs) + outputs)
1217
1253
 
1218
1254
  @property
1219
1255
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -294,7 +296,6 @@ class SpectralClustering(BaseTransformer):
294
296
  sample_weight_col: Optional[str] = None,
295
297
  ) -> None:
296
298
  super().__init__()
297
- self.id = str(uuid4()).replace("-", "_").upper()
298
299
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
299
300
 
300
301
  self._deps = list(deps)
@@ -328,6 +329,15 @@ class SpectralClustering(BaseTransformer):
328
329
  self.set_drop_input_cols(drop_input_cols)
329
330
  self.set_sample_weight_col(sample_weight_col)
330
331
 
332
+ def _get_rand_id(self) -> str:
333
+ """
334
+ Generate random id to be used in sproc and stage names.
335
+
336
+ Returns:
337
+ Random id string usable in sproc, table, and stage names.
338
+ """
339
+ return str(uuid4()).replace("-", "_").upper()
340
+
331
341
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
332
342
  """
333
343
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -406,7 +416,7 @@ class SpectralClustering(BaseTransformer):
406
416
  cp.dump(self._sklearn_object, local_transform_file)
407
417
 
408
418
  # Create temp stage to run fit.
409
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
419
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
410
420
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
411
421
  SqlResultValidator(
412
422
  session=session,
@@ -419,11 +429,12 @@ class SpectralClustering(BaseTransformer):
419
429
  expected_value=f"Stage area {transform_stage_name} successfully created."
420
430
  ).validate()
421
431
 
422
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
432
+ # Use posixpath to construct stage paths
433
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
434
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
423
435
  local_result_file_name = get_temp_file_path()
424
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
425
436
 
426
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
437
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
427
438
  statement_params = telemetry.get_function_usage_statement_params(
428
439
  project=_PROJECT,
429
440
  subproject=_SUBPROJECT,
@@ -449,6 +460,7 @@ class SpectralClustering(BaseTransformer):
449
460
  replace=True,
450
461
  session=session,
451
462
  statement_params=statement_params,
463
+ anonymous=True
452
464
  )
453
465
  def fit_wrapper_sproc(
454
466
  session: Session,
@@ -457,7 +469,8 @@ class SpectralClustering(BaseTransformer):
457
469
  stage_result_file_name: str,
458
470
  input_cols: List[str],
459
471
  label_cols: List[str],
460
- sample_weight_col: Optional[str]
472
+ sample_weight_col: Optional[str],
473
+ statement_params: Dict[str, str]
461
474
  ) -> str:
462
475
  import cloudpickle as cp
463
476
  import numpy as np
@@ -524,15 +537,15 @@ class SpectralClustering(BaseTransformer):
524
537
  api_calls=[Session.call],
525
538
  custom_tags=dict([("autogen", True)]),
526
539
  )
527
- sproc_export_file_name = session.call(
528
- fit_sproc_name,
540
+ sproc_export_file_name = fit_wrapper_sproc(
541
+ session,
529
542
  query,
530
543
  stage_transform_file_name,
531
544
  stage_result_file_name,
532
545
  identifier.get_unescaped_names(self.input_cols),
533
546
  identifier.get_unescaped_names(self.label_cols),
534
547
  identifier.get_unescaped_names(self.sample_weight_col),
535
- statement_params=statement_params,
548
+ statement_params,
536
549
  )
537
550
 
538
551
  if "|" in sproc_export_file_name:
@@ -542,7 +555,7 @@ class SpectralClustering(BaseTransformer):
542
555
  print("\n".join(fields[1:]))
543
556
 
544
557
  session.file.get(
545
- os.path.join(stage_result_file_name, sproc_export_file_name),
558
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
546
559
  local_result_file_name,
547
560
  statement_params=statement_params
548
561
  )
@@ -588,7 +601,7 @@ class SpectralClustering(BaseTransformer):
588
601
 
589
602
  # Register vectorized UDF for batch inference
590
603
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
591
- safe_id=self.id, method=inference_method)
604
+ safe_id=self._get_rand_id(), method=inference_method)
592
605
 
593
606
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
594
607
  # will try to pickle all of self which fails.
@@ -680,7 +693,7 @@ class SpectralClustering(BaseTransformer):
680
693
  return transformed_pandas_df.to_dict("records")
681
694
 
682
695
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
683
- safe_id=self.id
696
+ safe_id=self._get_rand_id()
684
697
  )
685
698
 
686
699
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -736,26 +749,37 @@ class SpectralClustering(BaseTransformer):
736
749
  # input cols need to match unquoted / quoted
737
750
  input_cols = self.input_cols
738
751
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
752
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
739
753
 
740
754
  estimator = self._sklearn_object
741
755
 
742
- input_df = dataset[input_cols] # Select input columns with quoted column names.
743
- if hasattr(estimator, "feature_names_in_"):
744
- missing_features = []
745
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
746
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
747
- missing_features.append(f)
748
-
749
- if len(missing_features) > 0:
750
- raise ValueError(
751
- "The feature names should match with those that were passed during fit.\n"
752
- f"Features seen during fit call but not present in the input: {missing_features}\n"
753
- f"Features in the input dataframe : {input_cols}\n"
754
- )
755
- input_df.columns = getattr(estimator, "feature_names_in_")
756
- else:
757
- # Just rename the column names to unquoted identifiers.
758
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
756
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
757
+ missing_features = []
758
+ features_in_dataset = set(dataset.columns)
759
+ columns_to_select = []
760
+ for i, f in enumerate(features_required_by_estimator):
761
+ if (
762
+ i >= len(input_cols)
763
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
764
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
765
+ and quoted_input_cols[i] not in features_in_dataset)
766
+ ):
767
+ missing_features.append(f)
768
+ elif input_cols[i] in features_in_dataset:
769
+ columns_to_select.append(input_cols[i])
770
+ elif unquoted_input_cols[i] in features_in_dataset:
771
+ columns_to_select.append(unquoted_input_cols[i])
772
+ else:
773
+ columns_to_select.append(quoted_input_cols[i])
774
+
775
+ if len(missing_features) > 0:
776
+ raise ValueError(
777
+ "The feature names should match with those that were passed during fit.\n"
778
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
779
+ f"Features in the input dataframe : {input_cols}\n"
780
+ )
781
+ input_df = dataset[columns_to_select]
782
+ input_df.columns = features_required_by_estimator
759
783
 
760
784
  transformed_numpy_array = getattr(estimator, inference_method)(
761
785
  input_df
@@ -834,11 +858,18 @@ class SpectralClustering(BaseTransformer):
834
858
  Transformed dataset.
835
859
  """
836
860
  if isinstance(dataset, DataFrame):
861
+ expected_type_inferred = ""
862
+ # when it is classifier, infer the datatype from label columns
863
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
864
+ expected_type_inferred = convert_sp_to_sf_type(
865
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
866
+ )
867
+
837
868
  output_df = self._batch_inference(
838
869
  dataset=dataset,
839
870
  inference_method="predict",
840
871
  expected_output_cols_list=self.output_cols,
841
- expected_output_cols_type="",
872
+ expected_output_cols_type=expected_type_inferred,
842
873
  )
843
874
  elif isinstance(dataset, pd.DataFrame):
844
875
  output_df = self._sklearn_inference(
@@ -909,10 +940,10 @@ class SpectralClustering(BaseTransformer):
909
940
 
910
941
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
911
942
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
912
- Returns an empty list if current object is not a classifier or not yet fitted.
943
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
913
944
  """
914
945
  if getattr(self._sklearn_object, "classes_", None) is None:
915
- return []
946
+ return [output_cols_prefix]
916
947
 
917
948
  classes = self._sklearn_object.classes_
918
949
  if isinstance(classes, numpy.ndarray):
@@ -1137,7 +1168,7 @@ class SpectralClustering(BaseTransformer):
1137
1168
  cp.dump(self._sklearn_object, local_score_file)
1138
1169
 
1139
1170
  # Create temp stage to run score.
1140
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1171
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1141
1172
  session = dataset._session
1142
1173
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1143
1174
  SqlResultValidator(
@@ -1151,8 +1182,9 @@ class SpectralClustering(BaseTransformer):
1151
1182
  expected_value=f"Stage area {score_stage_name} successfully created."
1152
1183
  ).validate()
1153
1184
 
1154
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1155
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1185
+ # Use posixpath to construct stage paths
1186
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1187
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1156
1188
  statement_params = telemetry.get_function_usage_statement_params(
1157
1189
  project=_PROJECT,
1158
1190
  subproject=_SUBPROJECT,
@@ -1178,6 +1210,7 @@ class SpectralClustering(BaseTransformer):
1178
1210
  replace=True,
1179
1211
  session=session,
1180
1212
  statement_params=statement_params,
1213
+ anonymous=True
1181
1214
  )
1182
1215
  def score_wrapper_sproc(
1183
1216
  session: Session,
@@ -1185,7 +1218,8 @@ class SpectralClustering(BaseTransformer):
1185
1218
  stage_score_file_name: str,
1186
1219
  input_cols: List[str],
1187
1220
  label_cols: List[str],
1188
- sample_weight_col: Optional[str]
1221
+ sample_weight_col: Optional[str],
1222
+ statement_params: Dict[str, str]
1189
1223
  ) -> float:
1190
1224
  import cloudpickle as cp
1191
1225
  import numpy as np
@@ -1235,14 +1269,14 @@ class SpectralClustering(BaseTransformer):
1235
1269
  api_calls=[Session.call],
1236
1270
  custom_tags=dict([("autogen", True)]),
1237
1271
  )
1238
- score = session.call(
1239
- score_sproc_name,
1272
+ score = score_wrapper_sproc(
1273
+ session,
1240
1274
  query,
1241
1275
  stage_score_file_name,
1242
1276
  identifier.get_unescaped_names(self.input_cols),
1243
1277
  identifier.get_unescaped_names(self.label_cols),
1244
1278
  identifier.get_unescaped_names(self.sample_weight_col),
1245
- statement_params=statement_params,
1279
+ statement_params,
1246
1280
  )
1247
1281
 
1248
1282
  cleanup_temp_files([local_score_file_name])
@@ -1260,18 +1294,20 @@ class SpectralClustering(BaseTransformer):
1260
1294
  if self._sklearn_object._estimator_type == 'classifier':
1261
1295
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1262
1296
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1263
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1297
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1298
+ ([] if self._drop_input_cols else inputs) + outputs)
1264
1299
  # For regressor, the type of predict is float64
1265
1300
  elif self._sklearn_object._estimator_type == 'regressor':
1266
1301
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1267
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1268
-
1302
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1303
+ ([] if self._drop_input_cols else inputs) + outputs)
1269
1304
  for prob_func in PROB_FUNCTIONS:
1270
1305
  if hasattr(self, prob_func):
1271
1306
  output_cols_prefix: str = f"{prob_func}_"
1272
1307
  output_column_names = self._get_output_column_names(output_cols_prefix)
1273
1308
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1274
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1309
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1310
+ ([] if self._drop_input_cols else inputs) + outputs)
1275
1311
 
1276
1312
  @property
1277
1313
  def model_signatures(self) -> Dict[str, ModelSignature]: