snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/ensemble/bagging_regressor.py (+79 -43)

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -250,7 +252,6 @@ class BaggingRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimator)
         deps = deps | _gather_dependencies(base_estimator)
@@ -283,6 +284,15 @@ class BaggingRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
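The removed instance-level `self.id` was generated once in `__init__`; the new `_get_rand_id()` helper returns a fresh identifier on every call, so each temp stage, sproc, and table created below gets its own name. A standalone sketch of the pattern (`make_safe_id` is a name invented here for illustration):

```python
from uuid import uuid4

def make_safe_id() -> str:
    # Hyphens are not valid in unquoted Snowflake identifiers, so swap them
    # for underscores; uppercase matches Snowflake's default identifier casing.
    return str(uuid4()).replace("-", "_").upper()

print(make_safe_id())  # e.g. 'D94F5E3A_1C2B_4A8D_9E7F_0B6C3D2A1F4E'
```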
@@ -361,7 +371,7 @@ class BaggingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -374,11 +384,12 @@ class BaggingRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
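The switch from `os.path.join` to `posixpath.join` matters because stage paths always use forward slashes, while `os.path.join` follows the client OS. A quick illustration, using `ntpath` to reproduce the Windows behavior on any host:

```python
import ntpath      # os.path's implementation on Windows
import posixpath   # os.path's implementation on POSIX systems

stage_name = "SNOWML_TRANSFORM_ABC123"
file_name = "model.pkl.zip"

print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl.zip
print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl.zip
```

On a Windows client the old code produced the backslash form, which is not a valid stage path.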
@@ -404,6 +415,7 @@ class BaggingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -412,7 +424,8 @@ class BaggingRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -479,15 +492,15 @@ class BaggingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
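Two related changes run through these hunks: the fit sproc is registered with `anonymous=True`, so no named object is left behind in the schema, and it is invoked by calling the returned handle directly rather than via `session.call`. `statement_params` accordingly becomes an ordinary parameter of the wrapped function. A condensed sketch of the shape, assuming an existing Snowpark `Session` (the body here is a stub, not the real fit logic):

```python
from typing import Dict
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

def register_and_run(session: Session) -> str:
    @sproc(replace=True, session=session, anonymous=True)  # no named object is created
    def fit_wrapper_sproc(session: Session, query: str, statement_params: Dict[str, str]) -> str:
        return "ok"  # stub; the real sproc fits the model and uploads it to a stage

    # A named sproc would be invoked via session.call("SNOWML_FIT_<id>", ...);
    # the anonymous handle is called directly, mirroring the diff's convention.
    return fit_wrapper_sproc(session, "SELECT 1", {})
```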
@@ -497,7 +510,7 @@ class BaggingRegressor(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -543,7 +556,7 @@ class BaggingRegressor(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -635,7 +648,7 @@ class BaggingRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -691,26 +704,37 @@ class BaggingRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
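The rewritten block widens name matching: each feature the estimator saw at fit time may appear in the incoming dataframe under its original, unquoted, or quoted spelling, and only columns actually present in the dataset are selected. A toy version of the selection logic, with plain strings standing in for the identifier helpers:

```python
from typing import List, Sequence, Tuple

def select_columns(
    required_features: Sequence[str],
    name_variants: Sequence[Tuple[str, str, str]],  # (original, unquoted, quoted) per input col
    dataset_cols: Sequence[str],
) -> List[str]:
    available = set(dataset_cols)
    missing: List[str] = []
    selected: List[str] = []
    for i, feature in enumerate(required_features):
        if i >= len(name_variants) or feature not in name_variants[i]:
            missing.append(feature)
            continue
        # Prefer the original spelling, then unquoted, then quoted.
        present = next((v for v in name_variants[i] if v in available), None)
        if present is None:
            missing.append(feature)
        else:
            selected.append(present)
    if missing:
        raise ValueError(f"Features seen during fit but not present in the input: {missing}")
    return selected

print(select_columns(
    ["FEATURE_A", "feature_b"],
    [("FEATURE_A", "FEATURE_A", '"FEATURE_A"'), ('"feature_b"', "feature_b", '"feature_b"')],
    ["FEATURE_A", '"feature_b"'],
))  # ['FEATURE_A', '"feature_b"']
```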
@@ -791,11 +815,18 @@ class BaggingRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -866,10 +897,10 @@ class BaggingRegressor(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
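Where `_get_output_column_names` previously returned an empty list for a non-classifier (or unfitted) estimator, it now returns a single column named by the prefix, so the signature-building code below always has at least one output column to work with. A toy rendering of the change (the per-class branch is a simplification of the real method):

```python
from typing import List, Optional, Sequence

def get_output_column_names(classes: Optional[Sequence], prefix: str) -> List[str]:
    if classes is None:          # not a classifier, or not yet fitted
        return [prefix]          # was: return []
    return [f"{prefix}{c}" for c in classes]

print(get_output_column_names(None, "predict_proba_"))    # ['predict_proba_']
print(get_output_column_names([0, 1], "predict_proba_"))  # ['predict_proba_0', 'predict_proba_1']
```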
@@ -1094,7 +1125,7 @@ class BaggingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1108,8 +1139,9 @@ class BaggingRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1135,6 +1167,7 @@ class BaggingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1142,7 +1175,8 @@ class BaggingRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1192,14 +1226,14 @@ class BaggingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1217,18 +1251,20 @@ class BaggingRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
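Finally, every `ModelSignature` now carries the input features through to its outputs unless `_drop_input_cols` is set. Reduced to plain lists (strings stand in for the real `FeatureSpec` objects):

```python
from typing import List

def build_output_specs(inputs: List[str], outputs: List[str], drop_input_cols: bool) -> List[str]:
    # Inputs are prepended to the outputs unless they are explicitly dropped.
    return ([] if drop_input_cols else inputs) + outputs

print(build_output_specs(["F1", "F2"], ["OUTPUT_0"], drop_input_cols=False))  # ['F1', 'F2', 'OUTPUT_0']
print(build_output_specs(["F1", "F2"], ["OUTPUT_0"], drop_input_cols=True))   # ['OUTPUT_0']
```

The extra_trees_classifier.py diff below applies the same set of changes; only the classifier-specific pieces differ.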
snowflake/ml/modeling/ensemble/extra_trees_classifier.py (+79 -43)

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -348,7 +350,6 @@ class ExtraTreesClassifier(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -385,6 +386,15 @@ class ExtraTreesClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -463,7 +473,7 @@ class ExtraTreesClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -476,11 +486,12 @@ class ExtraTreesClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -506,6 +517,7 @@ class ExtraTreesClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -514,7 +526,8 @@ class ExtraTreesClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -581,15 +594,15 @@ class ExtraTreesClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
@@ -599,7 +612,7 @@ class ExtraTreesClassifier(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -645,7 +658,7 @@ class ExtraTreesClassifier(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -737,7 +750,7 @@ class ExtraTreesClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -793,26 +806,37 @@ class ExtraTreesClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -893,11 +917,18 @@ class ExtraTreesClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
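Unlike the BaggingRegressor hunk above, which hardcodes `expected_type_inferred = "float"`, the classifier variant starts from an empty string, so the branch that derives the Snowflake output type from the stored `predict` signature actually runs. A stubbed sketch of that flow (`FakeOutputSpec` and `FakeSignature` are stand-ins for the real signature objects):

```python
from snowflake.snowpark.types import LongType
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

class FakeOutputSpec:
    def as_snowpark_type(self) -> LongType:
        return LongType()  # e.g. integer class labels

class FakeSignature:
    outputs = [FakeOutputSpec()]

model_signatures = {"predict": FakeSignature()}

expected_type_inferred = ""  # the classifier variant starts empty
if expected_type_inferred == "" and "predict" in model_signatures:
    expected_type_inferred = convert_sp_to_sf_type(
        model_signatures["predict"].outputs[0].as_snowpark_type()
    )
print(expected_type_inferred)  # 'BIGINT', per Snowpark's type mapping
```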
@@ -968,10 +999,10 @@ class ExtraTreesClassifier(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1200,7 +1231,7 @@ class ExtraTreesClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1214,8 +1245,9 @@ class ExtraTreesClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1241,6 +1273,7 @@ class ExtraTreesClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1248,7 +1281,8 @@ class ExtraTreesClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1298,14 +1332,14 @@ class ExtraTreesClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1323,18 +1357,20 @@ class ExtraTreesClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
            if hasattr(self, prob_func):
                output_cols_prefix: str = f"{prob_func}_"
                output_column_names = self._get_output_column_names(output_cols_prefix)
                outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: