snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/cluster/spectral_coclustering.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -223,7 +225,6 @@ class SpectralCoclustering(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -249,6 +250,15 @@ class SpectralCoclustering(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
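
The removed per-instance `self.id` is replaced by `_get_rand_id()`, which mints a fresh identifier at every call site, so repeated fit/score runs on the same estimator no longer collide on one stage or sproc name. A minimal standalone sketch of the id scheme (not library code): uuid4 output contains hyphens, which are invalid in unquoted Snowflake identifiers, so they are swapped for underscores and the result is uppercased to match Snowflake's unquoted-identifier folding.

from uuid import uuid4

def get_rand_id() -> str:
    # Hyphens are not legal in unquoted Snowflake identifiers; "_" is.
    return str(uuid4()).replace("-", "_").upper()

print(f"SNOWML_TRANSFORM_{get_rand_id()}")  # fresh name on every call
print(f"SNOWML_TRANSFORM_{get_rand_id()}")  # differs from the line above
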
@@ -327,7 +337,7 @@ class SpectralCoclustering(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -340,11 +350,12 @@ class SpectralCoclustering(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
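
The stage path constructions switch from `os.path.join` to `posixpath.join` because Snowflake stage paths always use forward slashes, while `os.path.join` uses the client OS separator and would emit backslashes on Windows. A quick illustration, using `ntpath` (the Windows flavor of `os.path`) so the difference shows on any platform:

import ntpath      # what os.path resolves to on Windows
import posixpath

stage, fname = "SNOWML_TRANSFORM_ABC123", "model.pkl"
print(ntpath.join(stage, fname))     # SNOWML_TRANSFORM_ABC123\model.pkl  (broken stage path)
print(posixpath.join(stage, fname))  # SNOWML_TRANSFORM_ABC123/model.pkl  (valid)
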
@@ -370,6 +381,7 @@ class SpectralCoclustering(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -378,7 +390,8 @@ class SpectralCoclustering(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -445,15 +458,15 @@ class SpectralCoclustering(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
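
The fit sproc is now registered with `anonymous=True` and invoked through the returned handle (`fit_wrapper_sproc(session, ...)`) rather than by name via `session.call(fit_sproc_name, ...)`, so no named stored procedure object is left behind in the schema; `statement_params` also becomes an ordinary parameter of the procedure itself. A hedged sketch of the pattern, assuming a live Snowpark `session` (connection setup not shown):

from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.types import IntegerType

def register_add_one(session: Session):
    # anonymous=True: no named object is created, so the returned handle
    # must be called directly instead of via session.call(name, ...).
    return sproc(
        lambda session, x: x + 1,
        return_type=IntegerType(),
        input_types=[IntegerType()],
        session=session,
        replace=True,
        anonymous=True,
        packages=["snowflake-snowpark-python"],
    )

# add_one = register_add_one(session)
# add_one(session, 41)  # executes server-side and returns 42
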
@@ -463,7 +476,7 @@ class SpectralCoclustering(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -509,7 +522,7 @@ class SpectralCoclustering(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -601,7 +614,7 @@ class SpectralCoclustering(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -657,26 +670,37 @@ class SpectralCoclustering(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
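
The rewritten matching logic accepts each input column in any of three spellings: exactly as passed (`input_cols`), unescaped (`unquoted_input_cols`), or re-escaped (`quoted_input_cols`). It selects whichever form is actually present in the pandas batch and raises only when none of the three matches. Below are toy stand-ins for `identifier.get_unescaped_names` / `get_escaped_names`, purely to illustrate the three forms; the real helpers in `snowflake.ml._internal.utils.identifier` implement the full identifier rules.

def get_unescaped_name(name: str) -> str:
    # Quoted identifiers keep their exact case; unquoted ones fold to upper.
    if name.startswith('"') and name.endswith('"'):
        return name[1:-1].replace('""', '"')
    return name.upper()

def get_escaped_name(name: str) -> str:
    return '"' + name.replace('"', '""') + '"'

col = '"sepal_length"'                            # quoted, case-sensitive
print(get_unescaped_name(col))                    # sepal_length
print(get_escaped_name(get_unescaped_name(col)))  # "sepal_length"
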
@@ -755,11 +779,18 @@ class SpectralCoclustering(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
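
`predict` on a Snowpark DataFrame now derives the expected output column type from the stored model signature instead of leaving it blank: the first output's Snowpark type is converted to its Snowflake SQL name with `convert_sp_to_sf_type`. A hedged illustration of that mapping; this is a private Snowpark helper, so treat the printed strings as indicative rather than contractual:

from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
from snowflake.snowpark.types import DoubleType, LongType, StringType

print(convert_sp_to_sf_type(LongType()))    # e.g. BIGINT
print(convert_sp_to_sf_type(DoubleType()))  # e.g. DOUBLE
print(convert_sp_to_sf_type(StringType()))  # e.g. STRING
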
@@ -830,10 +861,10 @@ class SpectralCoclustering(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
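
With this fallback change, a non-classifier (no fitted `classes_`) reports a single output column named after the prefix instead of an empty list, so callers of `predict_proba()`-style methods always receive at least one column name. A minimal sketch of the revised behavior; the classifier branch is simplified here, and the real method also handles `classes_` stored as a numpy array:

from typing import List, Optional, Sequence

def get_output_column_names(classes: Optional[Sequence], output_cols_prefix: str) -> List[str]:
    if classes is None:  # not a classifier, or not yet fitted
        return [output_cols_prefix]
    return [f"{output_cols_prefix}{c}" for c in classes]

print(get_output_column_names(None, "decision_function_"))  # ['decision_function_']
print(get_output_column_names([0, 1], "predict_proba_"))    # ['predict_proba_0', 'predict_proba_1']
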
@@ -1058,7 +1089,7 @@ class SpectralCoclustering(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1072,8 +1103,9 @@ class SpectralCoclustering(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1099,6 +1131,7 @@ class SpectralCoclustering(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1106,7 +1139,8 @@ class SpectralCoclustering(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1156,14 +1190,14 @@ class SpectralCoclustering(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1181,18 +1215,20 @@
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
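
All three signature branches now prepend the input features to the outputs unless `drop_input_cols` is set, so the recorded signature matches the DataFrame that batch inference actually returns (pass-through inputs plus predictions). A short sketch reusing the signature types the wrapper imports from `snowflake.ml.model.model_signature`:

from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="SEPAL_LENGTH")]
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_PRED")]

for drop_input_cols in (False, True):
    sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
    # drop_input_cols=False -> output features: SEPAL_LENGTH, OUTPUT_PRED
    # drop_input_cols=True  -> output features: OUTPUT_PRED only
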
snowflake/ml/modeling/compose/column_transformer.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -253,7 +255,6 @@ class ColumnTransformer(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(transformers)
         self._deps = list(deps)
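
Unlike the leaf estimators, `ColumnTransformer` also unions in dependencies gathered from its nested transformers (`deps = deps | _gather_dependencies(transformers)`), so packages pinned by any sub-transformer travel with the fitted object. `_gather_dependencies` is internal to the package; a hypothetical sketch of the idea, with all names illustrative:

from typing import Any, Iterable, Set

def gather_dependencies(transformers: Iterable[Any]) -> Set[str]:
    deps: Set[str] = set()
    for item in transformers:
        # scikit-learn ColumnTransformer steps are (name, transformer, columns)
        trans = item[1] if isinstance(item, (list, tuple)) else item
        deps |= set(getattr(trans, "_deps", []))  # "_deps" is illustrative
    return deps
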
@@ -279,6 +280,15 @@ class ColumnTransformer(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -357,7 +367,7 @@ class ColumnTransformer(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -370,11 +380,12 @@ class ColumnTransformer(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -400,6 +411,7 @@ class ColumnTransformer(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -408,7 +420,8 @@ class ColumnTransformer(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -475,15 +488,15 @@ class ColumnTransformer(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -493,7 +506,7 @@ class ColumnTransformer(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -539,7 +552,7 @@ class ColumnTransformer(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -631,7 +644,7 @@ class ColumnTransformer(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
        )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -687,26 +700,37 @@ class ColumnTransformer(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -785,11 +809,18 @@ class ColumnTransformer(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -862,10 +893,10 @@ class ColumnTransformer(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1090,7 +1121,7 @@ class ColumnTransformer(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1104,8 +1135,9 @@ class ColumnTransformer(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1131,6 +1163,7 @@ class ColumnTransformer(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1138,7 +1171,8 @@ class ColumnTransformer(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1188,14 +1222,14 @@ class ColumnTransformer(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1213,18 +1247,20 @@
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: