snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/compose/transformed_target_regressor.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -214,7 +216,6 @@ class TransformedTargetRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -238,6 +239,15 @@ class TransformedTargetRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
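The new _get_rand_id helper replaces the per-instance self.id, so every stage, sproc, and table name below is minted fresh at each call site rather than reused across calls. A minimal standalone sketch of the pattern (the free function is hypothetical; in the package this is a method on the autogenerated estimators):

from uuid import uuid4

def get_rand_id() -> str:
    # Hyphens are not valid in unquoted Snowflake identifiers, so the UUID
    # is underscored and upper-cased before being embedded in object names.
    return str(uuid4()).replace("-", "_").upper()

stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=get_rand_id())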
@@ -316,7 +326,7 @@ class TransformedTargetRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -329,11 +339,12 @@ class TransformedTargetRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
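Snowflake stage paths always use forward slashes, so building them with os.path.join is only correct on POSIX systems; on Windows it yields backslashes. posixpath.join behaves identically everywhere. A quick standard-library illustration:

import ntpath      # what os.path resolves to on Windows
import posixpath   # what os.path resolves to on POSIX systems

# ntpath shows what the old code would have produced on Windows.
print(ntpath.join("SNOWML_TRANSFORM_AB12", "model.pkl"))     # SNOWML_TRANSFORM_AB12\model.pkl
print(posixpath.join("SNOWML_TRANSFORM_AB12", "model.pkl"))  # SNOWML_TRANSFORM_AB12/model.pkl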
@@ -359,6 +370,7 @@ class TransformedTargetRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -367,7 +379,8 @@ class TransformedTargetRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -434,15 +447,15 @@ class TransformedTargetRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
@@ -452,7 +465,7 @@ class TransformedTargetRegressor(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
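Registering the wrapper with anonymous=True means the procedure has no catalog name, so it is invoked directly through the Python handle returned by the decorator instead of via session.call(name, ...); statement_params is now threaded through as an ordinary positional argument of the wrapper. A hedged sketch of the call pattern, assuming a live Snowpark session (the wrapper body is a placeholder, not the package's actual fit logic):

from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

def fit_via_anonymous_sproc(session: Session, payload: str) -> str:
    @sproc(replace=True, session=session, anonymous=True,
           packages=["snowflake-snowpark-python"])
    def fit_wrapper_sproc(session: Session, payload: str) -> str:
        # Placeholder body; the real wrapper unpickles the estimator from a
        # stage, fits it server-side, and uploads the result.
        return payload

    # Anonymous sprocs are called through the returned handle, not by name.
    return fit_wrapper_sproc(session, payload)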
@@ -498,7 +511,7 @@ class TransformedTargetRegressor(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -590,7 +603,7 @@ class TransformedTargetRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -646,26 +659,37 @@ class TransformedTargetRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
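The rewritten matching loop accepts each expected feature under any of its three spellings (as configured, unquoted, or quoted) and only reports it missing when no spelling is present in the dataframe. A standalone rendition of the idea (the helper name is hypothetical; get_unescaped_names and get_escaped_names are the package's internal identifier utilities):

import pandas as pd

def select_required_features(dataset: pd.DataFrame, input_cols, unquoted_input_cols,
                             quoted_input_cols, required):
    features_in_dataset = set(dataset.columns)
    columns_to_select, missing_features = [], []
    for i, f in enumerate(required):
        variants = ((input_cols[i], unquoted_input_cols[i], quoted_input_cols[i])
                    if i < len(input_cols) else ())
        if f not in variants or not any(v in features_in_dataset for v in variants):
            missing_features.append(f)
        else:
            # The first spelling actually present in the dataframe wins.
            columns_to_select.append(next(v for v in variants if v in features_in_dataset))
    if missing_features:
        raise ValueError(f"Features seen during fit but not present in the input: {missing_features}")
    input_df = dataset[columns_to_select].copy()
    input_df.columns = required
    return input_df

# '"b"' was fitted under its quoted spelling; the loop still resolves it.
df = pd.DataFrame({"A": [1.0], '"b"': [2.0]})
out = select_required_features(df, ["A", '"b"'], ["A", "b"], ['"A"', '"b"'], ["A", "b"])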
@@ -746,11 +770,18 @@ class TransformedTargetRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
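Here expected_type_inferred is hard-wired to "float" because TransformedTargetRegressor is a regressor, which leaves the inference branch dormant; estimators generated with an empty default (see the elliptic_envelope.py diff below) derive the Snowflake column type from the stored predict signature instead. A hedged sketch of that derivation (convert_sp_to_sf_type is a Snowpark-internal helper, so the exact strings are illustrative):

from snowflake.snowpark.types import DoubleType, LongType
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

# Maps a Snowpark type to its Snowflake SQL name, e.g. DoubleType -> "DOUBLE".
print(convert_sp_to_sf_type(DoubleType()))
print(convert_sp_to_sf_type(LongType()))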
@@ -821,10 +852,10 @@ class TransformedTargetRegressor(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
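Previously a non-classifier (or an unfitted estimator) yielded no output columns at all for predict_proba, decision_function, and friends; now it yields a single column named by the prefix, so downstream signature generation always has at least one column to describe. A hypothetical standalone version (only the fallback appears in the hunk; the per-class branch here is illustrative):

import numpy as np

def get_output_column_names(sklearn_object, output_cols_prefix: str) -> list:
    classes = getattr(sklearn_object, "classes_", None)
    if classes is None:
        # Not a classifier, or not yet fitted: one column, named by the prefix.
        return [output_cols_prefix]
    # Illustrative classifier branch: one column per class label.
    return [f"{output_cols_prefix}{c}" for c in np.asarray(classes)]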
@@ -1049,7 +1080,7 @@ class TransformedTargetRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1063,8 +1094,9 @@ class TransformedTargetRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1090,6 +1122,7 @@ class TransformedTargetRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1097,7 +1130,8 @@ class TransformedTargetRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1147,14 +1181,14 @@ class TransformedTargetRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1172,18 +1206,20 @@ class TransformedTargetRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
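Across all three signature kinds, the outputs now include the pass-through input columns unless drop_input_cols is set, matching what batch inference actually returns. A minimal sketch using the public signature classes (the column names are made up):

from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="FEATURE_1")]
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_1")]
drop_input_cols = False

# With drop_input_cols unset, the inputs ride along in the output schema.
sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)

The same autogenerated changes recur across the estimator wrappers in the file list above (each shows +79 -43); the elliptic_envelope.py diff below is representative of the remainder.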
snowflake/ml/modeling/covariance/elliptic_envelope.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -211,7 +213,6 @@ class EllipticEnvelope(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -235,6 +236,15 @@ class EllipticEnvelope(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -313,7 +323,7 @@ class EllipticEnvelope(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -326,11 +336,12 @@ class EllipticEnvelope(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -356,6 +367,7 @@ class EllipticEnvelope(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -364,7 +376,8 @@ class EllipticEnvelope(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -431,15 +444,15 @@ class EllipticEnvelope(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -449,7 +462,7 @@ class EllipticEnvelope(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -495,7 +508,7 @@ class EllipticEnvelope(BaseTransformer):
 
         # Register vectorized UDF for batch inference
        batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -587,7 +600,7 @@ class EllipticEnvelope(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -643,26 +656,37 @@ class EllipticEnvelope(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -743,11 +767,18 @@ class EllipticEnvelope(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -818,10 +849,10 @@ class EllipticEnvelope(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1048,7 +1079,7 @@ class EllipticEnvelope(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1062,8 +1093,9 @@ class EllipticEnvelope(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1089,6 +1121,7 @@ class EllipticEnvelope(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1096,7 +1129,8 @@ class EllipticEnvelope(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1146,14 +1180,14 @@ class EllipticEnvelope(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1171,18 +1205,20 @@ class EllipticEnvelope(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: