snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -328,7 +330,6 @@ class HistGradientBoostingRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -367,6 +368,15 @@ class HistGradientBoostingRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
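
Note: the one-time `self.id` assigned in `__init__` is replaced by the per-call `_get_rand_id()` helper above, so every fit, inference, and score call names its temporary stage, table, and sproc with a fresh id instead of reusing one id for the lifetime of the estimator. A minimal standalone sketch of what the helper produces:

```python
from uuid import uuid4

def get_rand_id() -> str:
    # uuid4 with "-" swapped for "_" and upper-cased, so the result can be
    # embedded in unquoted Snowflake object names such as temp stage names.
    return str(uuid4()).replace("-", "_").upper()

print(get_rand_id())  # e.g. 0F8B2A6C_3D41_4E5A_9B77_2C1D0E9F8A3B
```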
@@ -445,7 +455,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -458,11 +468,12 @@ class HistGradientBoostingRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
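
Stage paths in Snowflake always use forward slashes, while `os.path.join` follows the client OS separator, so on Windows it would produce backslash-separated paths that are not valid stage locations; presumably that is the motivation for the switch to `posixpath`. A quick sketch of the difference, using `ntpath` to emulate Windows behavior on any platform:

```python
import ntpath      # the os.path implementation used on Windows
import posixpath   # always joins with "/"

stage, fname = "SNOWML_TRANSFORM_ABC", "model.pkl"

print(ntpath.join(stage, fname))     # SNOWML_TRANSFORM_ABC\model.pkl  (broken stage path)
print(posixpath.join(stage, fname))  # SNOWML_TRANSFORM_ABC/model.pkl  (what Snowflake expects)
```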
@@ -488,6 +499,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
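
With `anonymous=True`, the wrapper is registered as an anonymous stored procedure: no named object is created in the schema, so repeated fits leave nothing behind to clean up. Correspondingly, the generated code now invokes the handle returned by the `@sproc` decorator (see the `fit_wrapper_sproc(session, ...)` call below) rather than calling a name via `session.call`. A hedged sketch of the pattern; the connection parameters are placeholders:

```python
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

connection_parameters: dict = {}  # account, user, password, ... (fill in)
session = Session.builder.configs(connection_parameters).create()

@sproc(packages=["snowflake-snowpark-python"], replace=True, session=session, anonymous=True)
def add_one(session_: Session, x: int) -> int:
    return x + 1

# Anonymous sprocs are invoked through the returned handle; the session may
# be passed as the first positional argument, mirroring the code below.
result = add_one(session, 1)  # -> 2
```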
@@ -496,7 +508,8 @@ class HistGradientBoostingRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -563,15 +576,15 @@ class HistGradientBoostingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

@@ -581,7 +594,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -627,7 +640,7 @@ class HistGradientBoostingRegressor(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -719,7 +732,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -775,26 +788,37 @@ class HistGradientBoostingRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
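
The rewritten selection logic tries each input column under three spellings, the configured name, its unescaped form (`identifier.get_unescaped_names`), and its re-escaped form (`identifier.get_escaped_names`), and picks whichever actually exists in the pandas frame, instead of assuming the quoted spelling as the old code did. A plain-string mock of the matching; the helper outputs shown in comments are assumptions for illustration:

```python
feature_names_in_ = ["AGE", "INCOME"]    # what the estimator was fit on
input_cols = ['"AGE"', '"INCOME"']       # as configured on the transformer
unquoted_cols = ["AGE", "INCOME"]        # ~ identifier.get_unescaped_names(input_cols)
quoted_cols = ['"AGE"', '"INCOME"']      # ~ identifier.get_escaped_names(unquoted_cols)
features_in_dataset = {"AGE", "INCOME"}  # set(dataset.columns) after to_pandas()

missing, selected = [], []
for i, f in enumerate(feature_names_in_):
    candidates = (input_cols[i], unquoted_cols[i], quoted_cols[i])
    if f not in candidates or not any(c in features_in_dataset for c in candidates):
        missing.append(f)
    else:
        # first spelling that actually exists in the frame wins
        selected.append(next(c for c in candidates if c in features_in_dataset))

print(missing, selected)  # [] ['AGE', 'INCOME']
```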
@@ -875,11 +899,18 @@ class HistGradientBoostingRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
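
`predict` over a Snowpark DataFrame no longer hard-codes the output column type. The guard only fires when the pre-set value is empty, so for this regressor (pre-set to "float") behavior is unchanged; estimators that leave it empty (see the IsolationForest diff below) derive the type from the stored `predict` signature. A sketch of the conversion step, with the caveat that `convert_sp_to_sf_type` is a private Snowpark helper and the exact return strings are assumptions:

```python
from snowflake.snowpark.types import DoubleType, LongType, StringType
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

# Maps a Snowpark DataType to the Snowflake SQL type name that
# _batch_inference uses for the declared output column.
print(convert_sp_to_sf_type(DoubleType()))  # e.g. DOUBLE
print(convert_sp_to_sf_type(LongType()))    # e.g. BIGINT
print(convert_sp_to_sf_type(StringType()))  # e.g. STRING
```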
@@ -950,10 +981,10 @@ class HistGradientBoostingRegressor(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
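
The fallback for non-classifiers changes from an empty list to `[output_cols_prefix]`, so downstream code always receives at least one output column name. A condensed sketch of the method's behavior after the change, with the per-class naming scheme inferred from the surrounding code:

```python
def get_output_column_names(classes, output_cols_prefix: str):
    if classes is None:               # not a classifier, or not yet fitted
        return [output_cols_prefix]   # previously: []
    return [f"{output_cols_prefix}{c}" for c in classes]

print(get_output_column_names(None, "decision_function_"))  # ['decision_function_']
print(get_output_column_names([0, 1], "predict_proba_"))    # ['predict_proba_0', 'predict_proba_1']
```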
@@ -1178,7 +1209,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1192,8 +1223,9 @@ class HistGradientBoostingRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1219,6 +1251,7 @@ class HistGradientBoostingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1226,7 +1259,8 @@ class HistGradientBoostingRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1276,14 +1310,14 @@ class HistGradientBoostingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1301,18 +1335,20 @@ class HistGradientBoostingRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
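
All three signature branches now append the input features to the declared outputs unless `drop_input_cols` is set, so the signature matches the frame that `_batch_inference` actually returns (inputs are passed through alongside the predictions). A sketch with hypothetical feature names:

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="AGE")]        # hypothetical input
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_0")]  # hypothetical prediction

drop_input_cols = False
sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
# declared outputs: AGE, OUTPUT_0; with drop_input_cols=True: OUTPUT_0 only
```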
snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -244,7 +246,6 @@ class IsolationForest(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -272,6 +273,15 @@ class IsolationForest(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -350,7 +360,7 @@ class IsolationForest(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -363,11 +373,12 @@ class IsolationForest(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -393,6 +404,7 @@ class IsolationForest(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -401,7 +413,8 @@ class IsolationForest(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -468,15 +481,15 @@ class IsolationForest(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

@@ -486,7 +499,7 @@ class IsolationForest(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -532,7 +545,7 @@ class IsolationForest(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -624,7 +637,7 @@ class IsolationForest(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -680,26 +693,37 @@ class IsolationForest(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -780,11 +804,18 @@ class IsolationForest(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
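
Note that unlike HistGradientBoostingRegressor above, which pre-sets the expected type to "float", IsolationForest initializes `expected_type_inferred` to an empty string, so the signature-based type lookup added in this hunk actually takes effect at predict time.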
@@ -855,10 +886,10 @@ class IsolationForest(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1085,7 +1116,7 @@ class IsolationForest(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1099,8 +1130,9 @@ class IsolationForest(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1126,6 +1158,7 @@ class IsolationForest(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1133,7 +1166,8 @@ class IsolationForest(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1183,14 +1217,14 @@ class IsolationForest(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1208,18 +1242,20 @@ class IsolationForest(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: