snowflake-ml-python 1.0.1 → 1.0.3 (py3-none-any.whl)

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/linear_model/gamma_regressor.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -230,7 +232,6 @@ class GammaRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -256,6 +257,15 @@ class GammaRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -334,7 +344,7 @@ class GammaRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -347,11 +357,12 @@ class GammaRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -377,6 +388,7 @@ class GammaRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -385,7 +397,8 @@ class GammaRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -452,15 +465,15 @@ class GammaRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -470,7 +483,7 @@ class GammaRegressor(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -516,7 +529,7 @@ class GammaRegressor(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -608,7 +621,7 @@ class GammaRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -664,26 +677,37 @@ class GammaRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -764,11 +788,18 @@ class GammaRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -839,10 +870,10 @@ class GammaRegressor(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1067,7 +1098,7 @@ class GammaRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1081,8 +1112,9 @@ class GammaRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1108,6 +1140,7 @@ class GammaRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1115,7 +1148,8 @@ class GammaRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1165,14 +1199,14 @@ class GammaRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1190,18 +1224,20 @@ class GammaRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
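
Two mechanical changes repeat throughout this file. First, stage, table, and sproc names are now derived from a fresh _get_rand_id() call at each use instead of a single per-instance self.id, so every fit, score, and inference call gets its own temporary objects. Second, stage paths are built with posixpath.join rather than os.path.join, because Snowflake stage paths always use forward slashes regardless of the client OS. A minimal standalone sketch of both patterns, assuming nothing beyond the Python standard library; the stage and file names here are made up for illustration:

import os
import posixpath
from uuid import uuid4


def get_rand_id() -> str:
    # Fresh random id per call; dashes replaced and the result upper-cased
    # so it can be embedded in SQL identifiers (stage/table/sproc names).
    return str(uuid4()).replace("-", "_").upper()


# Hypothetical stage name and local temp file.
transform_stage_name = f"SNOWML_TRANSFORM_{get_rand_id()}"
local_transform_file_name = "/tmp/snowml_model.pkl"

# posixpath.join always emits "/" separators, matching stage path syntax
# on every platform; os.path.join would emit "\\" on Windows clients and
# produce an invalid stage path.
stage_transform_file_name = posixpath.join(
    transform_stage_name, os.path.basename(local_transform_file_name)
)
print(stage_transform_file_name)  # SNOWML_TRANSFORM_<ID>/snowml_model.pkl
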
snowflake/ml/modeling/linear_model/huber_regressor.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -214,7 +216,6 @@ class HuberRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -239,6 +240,15 @@ class HuberRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -317,7 +327,7 @@ class HuberRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -330,11 +340,12 @@ class HuberRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -360,6 +371,7 @@ class HuberRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -368,7 +380,8 @@ class HuberRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -435,15 +448,15 @@ class HuberRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -453,7 +466,7 @@ class HuberRegressor(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -499,7 +512,7 @@ class HuberRegressor(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -591,7 +604,7 @@ class HuberRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -647,26 +660,37 @@ class HuberRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -747,11 +771,18 @@ class HuberRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -822,10 +853,10 @@ class HuberRegressor(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1050,7 +1081,7 @@ class HuberRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1064,8 +1095,9 @@ class HuberRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1091,6 +1123,7 @@ class HuberRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1098,7 +1131,8 @@ class HuberRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1148,14 +1182,14 @@ class HuberRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
        )

         cleanup_temp_files([local_score_file_name])
@@ -1173,18 +1207,20 @@ class HuberRegressor(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
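
The largest behavioral change in both files is in _sklearn_inference: instead of selecting only the quoted input columns, the new code reconciles each feature the fitted estimator expects against the configured, unquoted, and re-quoted spellings of the input columns, and selects whichever spelling actually exists in the pandas dataframe. A self-contained sketch of that matching loop, with plain lists standing in for the identifier.get_unescaped_names / identifier.get_escaped_names helpers (the column names below are hypothetical):

from typing import List


def reconcile_input_columns(
    features_required: List[str],    # estimator.feature_names_in_, or unquoted cols
    input_cols: List[str],           # columns as configured (possibly quoted)
    unquoted_input_cols: List[str],  # unescaped spellings of input_cols
    quoted_input_cols: List[str],    # re-escaped spellings of the unquoted names
    dataset_columns: List[str],
) -> List[str]:
    """Pick, for each required feature, whichever spelling exists in the dataset."""
    features_in_dataset = set(dataset_columns)
    columns_to_select: List[str] = []
    missing_features: List[str] = []
    for i, f in enumerate(features_required):
        if (
            i >= len(input_cols)
            # no spelling at this position matches the required feature name
            or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
            # no spelling at this position is present in the dataframe
            or (input_cols[i] not in features_in_dataset
                and unquoted_input_cols[i] not in features_in_dataset
                and quoted_input_cols[i] not in features_in_dataset)
        ):
            missing_features.append(f)
        elif input_cols[i] in features_in_dataset:
            columns_to_select.append(input_cols[i])
        elif unquoted_input_cols[i] in features_in_dataset:
            columns_to_select.append(unquoted_input_cols[i])
        else:
            columns_to_select.append(quoted_input_cols[i])
    if missing_features:
        raise ValueError(
            "The feature names should match with those that were passed during fit.\n"
            f"Features seen during fit call but not present in the input: {missing_features}"
        )
    return columns_to_select


# Estimator fit on unquoted names; the dataframe kept the quoted spellings.
print(reconcile_input_columns(
    features_required=["AGE", "INCOME"],
    input_cols=['"AGE"', '"INCOME"'],
    unquoted_input_cols=["AGE", "INCOME"],
    quoted_input_cols=['"AGE"', '"INCOME"'],
    dataset_columns=['"AGE"', '"INCOME"'],
))  # -> ['"AGE"', '"INCOME"']

Each required feature must both match one of the three spellings at its position and be present in the dataframe; anything else is collected into missing_features and reported, mirroring the ValueError in the diffs above.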