snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -298,7 +300,6 @@ class BayesianGaussianMixture(BaseTransformer):
298
300
  sample_weight_col: Optional[str] = None,
299
301
  ) -> None:
300
302
  super().__init__()
301
- self.id = str(uuid4()).replace("-", "_").upper()
302
303
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
303
304
 
304
305
  self._deps = list(deps)
@@ -334,6 +335,15 @@ class BayesianGaussianMixture(BaseTransformer):
334
335
  self.set_drop_input_cols(drop_input_cols)
335
336
  self.set_sample_weight_col(sample_weight_col)
336
337
 
338
+ def _get_rand_id(self) -> str:
339
+ """
340
+ Generate random id to be used in sproc and stage names.
341
+
342
+ Returns:
343
+ Random id string usable in sproc, table, and stage names.
344
+ """
345
+ return str(uuid4()).replace("-", "_").upper()
346
+
337
347
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
338
348
  """
339
349
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -412,7 +422,7 @@ class BayesianGaussianMixture(BaseTransformer):
412
422
  cp.dump(self._sklearn_object, local_transform_file)
413
423
 
414
424
  # Create temp stage to run fit.
415
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
425
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
416
426
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
417
427
  SqlResultValidator(
418
428
  session=session,
@@ -425,11 +435,12 @@ class BayesianGaussianMixture(BaseTransformer):
425
435
  expected_value=f"Stage area {transform_stage_name} successfully created."
426
436
  ).validate()
427
437
 
428
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
438
+ # Use posixpath to construct stage paths
439
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
440
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
429
441
  local_result_file_name = get_temp_file_path()
430
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
431
442
 
432
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
443
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
433
444
  statement_params = telemetry.get_function_usage_statement_params(
434
445
  project=_PROJECT,
435
446
  subproject=_SUBPROJECT,
@@ -455,6 +466,7 @@ class BayesianGaussianMixture(BaseTransformer):
455
466
  replace=True,
456
467
  session=session,
457
468
  statement_params=statement_params,
469
+ anonymous=True
458
470
  )
459
471
  def fit_wrapper_sproc(
460
472
  session: Session,
@@ -463,7 +475,8 @@ class BayesianGaussianMixture(BaseTransformer):
463
475
  stage_result_file_name: str,
464
476
  input_cols: List[str],
465
477
  label_cols: List[str],
466
- sample_weight_col: Optional[str]
478
+ sample_weight_col: Optional[str],
479
+ statement_params: Dict[str, str]
467
480
  ) -> str:
468
481
  import cloudpickle as cp
469
482
  import numpy as np
@@ -530,15 +543,15 @@ class BayesianGaussianMixture(BaseTransformer):
530
543
  api_calls=[Session.call],
531
544
  custom_tags=dict([("autogen", True)]),
532
545
  )
533
- sproc_export_file_name = session.call(
534
- fit_sproc_name,
546
+ sproc_export_file_name = fit_wrapper_sproc(
547
+ session,
535
548
  query,
536
549
  stage_transform_file_name,
537
550
  stage_result_file_name,
538
551
  identifier.get_unescaped_names(self.input_cols),
539
552
  identifier.get_unescaped_names(self.label_cols),
540
553
  identifier.get_unescaped_names(self.sample_weight_col),
541
- statement_params=statement_params,
554
+ statement_params,
542
555
  )
543
556
 
544
557
  if "|" in sproc_export_file_name:
@@ -548,7 +561,7 @@ class BayesianGaussianMixture(BaseTransformer):
548
561
  print("\n".join(fields[1:]))
549
562
 
550
563
  session.file.get(
551
- os.path.join(stage_result_file_name, sproc_export_file_name),
564
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
552
565
  local_result_file_name,
553
566
  statement_params=statement_params
554
567
  )
@@ -594,7 +607,7 @@ class BayesianGaussianMixture(BaseTransformer):
594
607
 
595
608
  # Register vectorized UDF for batch inference
596
609
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
597
- safe_id=self.id, method=inference_method)
610
+ safe_id=self._get_rand_id(), method=inference_method)
598
611
 
599
612
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
600
613
  # will try to pickle all of self which fails.
@@ -686,7 +699,7 @@ class BayesianGaussianMixture(BaseTransformer):
686
699
  return transformed_pandas_df.to_dict("records")
687
700
 
688
701
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
689
- safe_id=self.id
702
+ safe_id=self._get_rand_id()
690
703
  )
691
704
 
692
705
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -742,26 +755,37 @@ class BayesianGaussianMixture(BaseTransformer):
742
755
  # input cols need to match unquoted / quoted
743
756
  input_cols = self.input_cols
744
757
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
758
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
745
759
 
746
760
  estimator = self._sklearn_object
747
761
 
748
- input_df = dataset[input_cols] # Select input columns with quoted column names.
749
- if hasattr(estimator, "feature_names_in_"):
750
- missing_features = []
751
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
752
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
753
- missing_features.append(f)
754
-
755
- if len(missing_features) > 0:
756
- raise ValueError(
757
- "The feature names should match with those that were passed during fit.\n"
758
- f"Features seen during fit call but not present in the input: {missing_features}\n"
759
- f"Features in the input dataframe : {input_cols}\n"
760
- )
761
- input_df.columns = getattr(estimator, "feature_names_in_")
762
- else:
763
- # Just rename the column names to unquoted identifiers.
764
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
762
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
763
+ missing_features = []
764
+ features_in_dataset = set(dataset.columns)
765
+ columns_to_select = []
766
+ for i, f in enumerate(features_required_by_estimator):
767
+ if (
768
+ i >= len(input_cols)
769
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
770
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
771
+ and quoted_input_cols[i] not in features_in_dataset)
772
+ ):
773
+ missing_features.append(f)
774
+ elif input_cols[i] in features_in_dataset:
775
+ columns_to_select.append(input_cols[i])
776
+ elif unquoted_input_cols[i] in features_in_dataset:
777
+ columns_to_select.append(unquoted_input_cols[i])
778
+ else:
779
+ columns_to_select.append(quoted_input_cols[i])
780
+
781
+ if len(missing_features) > 0:
782
+ raise ValueError(
783
+ "The feature names should match with those that were passed during fit.\n"
784
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
785
+ f"Features in the input dataframe : {input_cols}\n"
786
+ )
787
+ input_df = dataset[columns_to_select]
788
+ input_df.columns = features_required_by_estimator
765
789
 
766
790
  transformed_numpy_array = getattr(estimator, inference_method)(
767
791
  input_df
@@ -842,11 +866,18 @@ class BayesianGaussianMixture(BaseTransformer):
842
866
  Transformed dataset.
843
867
  """
844
868
  if isinstance(dataset, DataFrame):
869
+ expected_type_inferred = ""
870
+ # when it is classifier, infer the datatype from label columns
871
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
872
+ expected_type_inferred = convert_sp_to_sf_type(
873
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
874
+ )
875
+
845
876
  output_df = self._batch_inference(
846
877
  dataset=dataset,
847
878
  inference_method="predict",
848
879
  expected_output_cols_list=self.output_cols,
849
- expected_output_cols_type="",
880
+ expected_output_cols_type=expected_type_inferred,
850
881
  )
851
882
  elif isinstance(dataset, pd.DataFrame):
852
883
  output_df = self._sklearn_inference(
@@ -917,10 +948,10 @@ class BayesianGaussianMixture(BaseTransformer):
917
948
 
918
949
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
919
950
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
920
- Returns an empty list if current object is not a classifier or not yet fitted.
951
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
921
952
  """
922
953
  if getattr(self._sklearn_object, "classes_", None) is None:
923
- return []
954
+ return [output_cols_prefix]
924
955
 
925
956
  classes = self._sklearn_object.classes_
926
957
  if isinstance(classes, numpy.ndarray):
@@ -1149,7 +1180,7 @@ class BayesianGaussianMixture(BaseTransformer):
1149
1180
  cp.dump(self._sklearn_object, local_score_file)
1150
1181
 
1151
1182
  # Create temp stage to run score.
1152
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1183
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1153
1184
  session = dataset._session
1154
1185
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1155
1186
  SqlResultValidator(
@@ -1163,8 +1194,9 @@ class BayesianGaussianMixture(BaseTransformer):
1163
1194
  expected_value=f"Stage area {score_stage_name} successfully created."
1164
1195
  ).validate()
1165
1196
 
1166
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1167
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1197
+ # Use posixpath to construct stage paths
1198
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1199
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1168
1200
  statement_params = telemetry.get_function_usage_statement_params(
1169
1201
  project=_PROJECT,
1170
1202
  subproject=_SUBPROJECT,
@@ -1190,6 +1222,7 @@ class BayesianGaussianMixture(BaseTransformer):
1190
1222
  replace=True,
1191
1223
  session=session,
1192
1224
  statement_params=statement_params,
1225
+ anonymous=True
1193
1226
  )
1194
1227
  def score_wrapper_sproc(
1195
1228
  session: Session,
@@ -1197,7 +1230,8 @@ class BayesianGaussianMixture(BaseTransformer):
1197
1230
  stage_score_file_name: str,
1198
1231
  input_cols: List[str],
1199
1232
  label_cols: List[str],
1200
- sample_weight_col: Optional[str]
1233
+ sample_weight_col: Optional[str],
1234
+ statement_params: Dict[str, str]
1201
1235
  ) -> float:
1202
1236
  import cloudpickle as cp
1203
1237
  import numpy as np
@@ -1247,14 +1281,14 @@ class BayesianGaussianMixture(BaseTransformer):
1247
1281
  api_calls=[Session.call],
1248
1282
  custom_tags=dict([("autogen", True)]),
1249
1283
  )
1250
- score = session.call(
1251
- score_sproc_name,
1284
+ score = score_wrapper_sproc(
1285
+ session,
1252
1286
  query,
1253
1287
  stage_score_file_name,
1254
1288
  identifier.get_unescaped_names(self.input_cols),
1255
1289
  identifier.get_unescaped_names(self.label_cols),
1256
1290
  identifier.get_unescaped_names(self.sample_weight_col),
1257
- statement_params=statement_params,
1291
+ statement_params,
1258
1292
  )
1259
1293
 
1260
1294
  cleanup_temp_files([local_score_file_name])
@@ -1272,18 +1306,20 @@ class BayesianGaussianMixture(BaseTransformer):
1272
1306
  if self._sklearn_object._estimator_type == 'classifier':
1273
1307
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1274
1308
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1275
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1309
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1310
+ ([] if self._drop_input_cols else inputs) + outputs)
1276
1311
  # For regressor, the type of predict is float64
1277
1312
  elif self._sklearn_object._estimator_type == 'regressor':
1278
1313
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1279
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1280
-
1314
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1315
+ ([] if self._drop_input_cols else inputs) + outputs)
1281
1316
  for prob_func in PROB_FUNCTIONS:
1282
1317
  if hasattr(self, prob_func):
1283
1318
  output_cols_prefix: str = f"{prob_func}_"
1284
1319
  output_column_names = self._get_output_column_names(output_cols_prefix)
1285
1320
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1286
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1321
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1322
+ ([] if self._drop_input_cols else inputs) + outputs)
1287
1323
 
1288
1324
  @property
1289
1325
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -274,7 +276,6 @@ class GaussianMixture(BaseTransformer):
274
276
  sample_weight_col: Optional[str] = None,
275
277
  ) -> None:
276
278
  super().__init__()
277
- self.id = str(uuid4()).replace("-", "_").upper()
278
279
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
279
280
 
280
281
  self._deps = list(deps)
@@ -307,6 +308,15 @@ class GaussianMixture(BaseTransformer):
307
308
  self.set_drop_input_cols(drop_input_cols)
308
309
  self.set_sample_weight_col(sample_weight_col)
309
310
 
311
+ def _get_rand_id(self) -> str:
312
+ """
313
+ Generate random id to be used in sproc and stage names.
314
+
315
+ Returns:
316
+ Random id string usable in sproc, table, and stage names.
317
+ """
318
+ return str(uuid4()).replace("-", "_").upper()
319
+
310
320
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
311
321
  """
312
322
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -385,7 +395,7 @@ class GaussianMixture(BaseTransformer):
385
395
  cp.dump(self._sklearn_object, local_transform_file)
386
396
 
387
397
  # Create temp stage to run fit.
388
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
398
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
389
399
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
390
400
  SqlResultValidator(
391
401
  session=session,
@@ -398,11 +408,12 @@ class GaussianMixture(BaseTransformer):
398
408
  expected_value=f"Stage area {transform_stage_name} successfully created."
399
409
  ).validate()
400
410
 
401
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
411
+ # Use posixpath to construct stage paths
412
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
413
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
402
414
  local_result_file_name = get_temp_file_path()
403
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
404
415
 
405
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
416
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
406
417
  statement_params = telemetry.get_function_usage_statement_params(
407
418
  project=_PROJECT,
408
419
  subproject=_SUBPROJECT,
@@ -428,6 +439,7 @@ class GaussianMixture(BaseTransformer):
428
439
  replace=True,
429
440
  session=session,
430
441
  statement_params=statement_params,
442
+ anonymous=True
431
443
  )
432
444
  def fit_wrapper_sproc(
433
445
  session: Session,
@@ -436,7 +448,8 @@ class GaussianMixture(BaseTransformer):
436
448
  stage_result_file_name: str,
437
449
  input_cols: List[str],
438
450
  label_cols: List[str],
439
- sample_weight_col: Optional[str]
451
+ sample_weight_col: Optional[str],
452
+ statement_params: Dict[str, str]
440
453
  ) -> str:
441
454
  import cloudpickle as cp
442
455
  import numpy as np
@@ -503,15 +516,15 @@ class GaussianMixture(BaseTransformer):
503
516
  api_calls=[Session.call],
504
517
  custom_tags=dict([("autogen", True)]),
505
518
  )
506
- sproc_export_file_name = session.call(
507
- fit_sproc_name,
519
+ sproc_export_file_name = fit_wrapper_sproc(
520
+ session,
508
521
  query,
509
522
  stage_transform_file_name,
510
523
  stage_result_file_name,
511
524
  identifier.get_unescaped_names(self.input_cols),
512
525
  identifier.get_unescaped_names(self.label_cols),
513
526
  identifier.get_unescaped_names(self.sample_weight_col),
514
- statement_params=statement_params,
527
+ statement_params,
515
528
  )
516
529
 
517
530
  if "|" in sproc_export_file_name:
@@ -521,7 +534,7 @@ class GaussianMixture(BaseTransformer):
521
534
  print("\n".join(fields[1:]))
522
535
 
523
536
  session.file.get(
524
- os.path.join(stage_result_file_name, sproc_export_file_name),
537
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
525
538
  local_result_file_name,
526
539
  statement_params=statement_params
527
540
  )
@@ -567,7 +580,7 @@ class GaussianMixture(BaseTransformer):
567
580
 
568
581
  # Register vectorized UDF for batch inference
569
582
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
570
- safe_id=self.id, method=inference_method)
583
+ safe_id=self._get_rand_id(), method=inference_method)
571
584
 
572
585
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
573
586
  # will try to pickle all of self which fails.
@@ -659,7 +672,7 @@ class GaussianMixture(BaseTransformer):
659
672
  return transformed_pandas_df.to_dict("records")
660
673
 
661
674
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
662
- safe_id=self.id
675
+ safe_id=self._get_rand_id()
663
676
  )
664
677
 
665
678
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -715,26 +728,37 @@ class GaussianMixture(BaseTransformer):
715
728
  # input cols need to match unquoted / quoted
716
729
  input_cols = self.input_cols
717
730
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
731
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
718
732
 
719
733
  estimator = self._sklearn_object
720
734
 
721
- input_df = dataset[input_cols] # Select input columns with quoted column names.
722
- if hasattr(estimator, "feature_names_in_"):
723
- missing_features = []
724
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
725
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
726
- missing_features.append(f)
727
-
728
- if len(missing_features) > 0:
729
- raise ValueError(
730
- "The feature names should match with those that were passed during fit.\n"
731
- f"Features seen during fit call but not present in the input: {missing_features}\n"
732
- f"Features in the input dataframe : {input_cols}\n"
733
- )
734
- input_df.columns = getattr(estimator, "feature_names_in_")
735
- else:
736
- # Just rename the column names to unquoted identifiers.
737
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
735
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
736
+ missing_features = []
737
+ features_in_dataset = set(dataset.columns)
738
+ columns_to_select = []
739
+ for i, f in enumerate(features_required_by_estimator):
740
+ if (
741
+ i >= len(input_cols)
742
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
743
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
744
+ and quoted_input_cols[i] not in features_in_dataset)
745
+ ):
746
+ missing_features.append(f)
747
+ elif input_cols[i] in features_in_dataset:
748
+ columns_to_select.append(input_cols[i])
749
+ elif unquoted_input_cols[i] in features_in_dataset:
750
+ columns_to_select.append(unquoted_input_cols[i])
751
+ else:
752
+ columns_to_select.append(quoted_input_cols[i])
753
+
754
+ if len(missing_features) > 0:
755
+ raise ValueError(
756
+ "The feature names should match with those that were passed during fit.\n"
757
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
758
+ f"Features in the input dataframe : {input_cols}\n"
759
+ )
760
+ input_df = dataset[columns_to_select]
761
+ input_df.columns = features_required_by_estimator
738
762
 
739
763
  transformed_numpy_array = getattr(estimator, inference_method)(
740
764
  input_df
@@ -815,11 +839,18 @@ class GaussianMixture(BaseTransformer):
815
839
  Transformed dataset.
816
840
  """
817
841
  if isinstance(dataset, DataFrame):
842
+ expected_type_inferred = ""
843
+ # when it is classifier, infer the datatype from label columns
844
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
845
+ expected_type_inferred = convert_sp_to_sf_type(
846
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
847
+ )
848
+
818
849
  output_df = self._batch_inference(
819
850
  dataset=dataset,
820
851
  inference_method="predict",
821
852
  expected_output_cols_list=self.output_cols,
822
- expected_output_cols_type="",
853
+ expected_output_cols_type=expected_type_inferred,
823
854
  )
824
855
  elif isinstance(dataset, pd.DataFrame):
825
856
  output_df = self._sklearn_inference(
@@ -890,10 +921,10 @@ class GaussianMixture(BaseTransformer):
890
921
 
891
922
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
892
923
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
893
- Returns an empty list if current object is not a classifier or not yet fitted.
924
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
894
925
  """
895
926
  if getattr(self._sklearn_object, "classes_", None) is None:
896
- return []
927
+ return [output_cols_prefix]
897
928
 
898
929
  classes = self._sklearn_object.classes_
899
930
  if isinstance(classes, numpy.ndarray):
@@ -1122,7 +1153,7 @@ class GaussianMixture(BaseTransformer):
1122
1153
  cp.dump(self._sklearn_object, local_score_file)
1123
1154
 
1124
1155
  # Create temp stage to run score.
1125
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1156
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1126
1157
  session = dataset._session
1127
1158
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1128
1159
  SqlResultValidator(
@@ -1136,8 +1167,9 @@ class GaussianMixture(BaseTransformer):
1136
1167
  expected_value=f"Stage area {score_stage_name} successfully created."
1137
1168
  ).validate()
1138
1169
 
1139
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1140
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1170
+ # Use posixpath to construct stage paths
1171
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1172
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1141
1173
  statement_params = telemetry.get_function_usage_statement_params(
1142
1174
  project=_PROJECT,
1143
1175
  subproject=_SUBPROJECT,
@@ -1163,6 +1195,7 @@ class GaussianMixture(BaseTransformer):
1163
1195
  replace=True,
1164
1196
  session=session,
1165
1197
  statement_params=statement_params,
1198
+ anonymous=True
1166
1199
  )
1167
1200
  def score_wrapper_sproc(
1168
1201
  session: Session,
@@ -1170,7 +1203,8 @@ class GaussianMixture(BaseTransformer):
1170
1203
  stage_score_file_name: str,
1171
1204
  input_cols: List[str],
1172
1205
  label_cols: List[str],
1173
- sample_weight_col: Optional[str]
1206
+ sample_weight_col: Optional[str],
1207
+ statement_params: Dict[str, str]
1174
1208
  ) -> float:
1175
1209
  import cloudpickle as cp
1176
1210
  import numpy as np
@@ -1220,14 +1254,14 @@ class GaussianMixture(BaseTransformer):
1220
1254
  api_calls=[Session.call],
1221
1255
  custom_tags=dict([("autogen", True)]),
1222
1256
  )
1223
- score = session.call(
1224
- score_sproc_name,
1257
+ score = score_wrapper_sproc(
1258
+ session,
1225
1259
  query,
1226
1260
  stage_score_file_name,
1227
1261
  identifier.get_unescaped_names(self.input_cols),
1228
1262
  identifier.get_unescaped_names(self.label_cols),
1229
1263
  identifier.get_unescaped_names(self.sample_weight_col),
1230
- statement_params=statement_params,
1264
+ statement_params,
1231
1265
  )
1232
1266
 
1233
1267
  cleanup_temp_files([local_score_file_name])
@@ -1245,18 +1279,20 @@ class GaussianMixture(BaseTransformer):
1245
1279
  if self._sklearn_object._estimator_type == 'classifier':
1246
1280
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1247
1281
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1248
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1282
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1283
+ ([] if self._drop_input_cols else inputs) + outputs)
1249
1284
  # For regressor, the type of predict is float64
1250
1285
  elif self._sklearn_object._estimator_type == 'regressor':
1251
1286
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1252
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1253
-
1287
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1288
+ ([] if self._drop_input_cols else inputs) + outputs)
1254
1289
  for prob_func in PROB_FUNCTIONS:
1255
1290
  if hasattr(self, prob_func):
1256
1291
  output_cols_prefix: str = f"{prob_func}_"
1257
1292
  output_column_names = self._get_output_column_names(output_cols_prefix)
1258
1293
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1259
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1294
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1295
+ ([] if self._drop_input_cols else inputs) + outputs)
1260
1296
 
1261
1297
  @property
1262
1298
  def model_signatures(self) -> Dict[str, ModelSignature]: