snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -323,7 +325,6 @@ class SGDRegressor(BaseTransformer):
323
325
  sample_weight_col: Optional[str] = None,
324
326
  ) -> None:
325
327
  super().__init__()
326
- self.id = str(uuid4()).replace("-", "_").upper()
327
328
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
328
329
 
329
330
  self._deps = list(deps)
@@ -361,6 +362,15 @@ class SGDRegressor(BaseTransformer):
361
362
  self.set_drop_input_cols(drop_input_cols)
362
363
  self.set_sample_weight_col(sample_weight_col)
363
364
 
365
+ def _get_rand_id(self) -> str:
366
+ """
367
+ Generate random id to be used in sproc and stage names.
368
+
369
+ Returns:
370
+ Random id string usable in sproc, table, and stage names.
371
+ """
372
+ return str(uuid4()).replace("-", "_").upper()
373
+
364
374
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
365
375
  """
366
376
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -439,7 +449,7 @@ class SGDRegressor(BaseTransformer):
439
449
  cp.dump(self._sklearn_object, local_transform_file)
440
450
 
441
451
  # Create temp stage to run fit.
442
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
452
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
443
453
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
444
454
  SqlResultValidator(
445
455
  session=session,
@@ -452,11 +462,12 @@ class SGDRegressor(BaseTransformer):
452
462
  expected_value=f"Stage area {transform_stage_name} successfully created."
453
463
  ).validate()
454
464
 
455
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
465
+ # Use posixpath to construct stage paths
466
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
467
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
456
468
  local_result_file_name = get_temp_file_path()
457
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
458
469
 
459
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
470
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
460
471
  statement_params = telemetry.get_function_usage_statement_params(
461
472
  project=_PROJECT,
462
473
  subproject=_SUBPROJECT,
@@ -482,6 +493,7 @@ class SGDRegressor(BaseTransformer):
482
493
  replace=True,
483
494
  session=session,
484
495
  statement_params=statement_params,
496
+ anonymous=True
485
497
  )
486
498
  def fit_wrapper_sproc(
487
499
  session: Session,
@@ -490,7 +502,8 @@ class SGDRegressor(BaseTransformer):
490
502
  stage_result_file_name: str,
491
503
  input_cols: List[str],
492
504
  label_cols: List[str],
493
- sample_weight_col: Optional[str]
505
+ sample_weight_col: Optional[str],
506
+ statement_params: Dict[str, str]
494
507
  ) -> str:
495
508
  import cloudpickle as cp
496
509
  import numpy as np
@@ -557,15 +570,15 @@ class SGDRegressor(BaseTransformer):
557
570
  api_calls=[Session.call],
558
571
  custom_tags=dict([("autogen", True)]),
559
572
  )
560
- sproc_export_file_name = session.call(
561
- fit_sproc_name,
573
+ sproc_export_file_name = fit_wrapper_sproc(
574
+ session,
562
575
  query,
563
576
  stage_transform_file_name,
564
577
  stage_result_file_name,
565
578
  identifier.get_unescaped_names(self.input_cols),
566
579
  identifier.get_unescaped_names(self.label_cols),
567
580
  identifier.get_unescaped_names(self.sample_weight_col),
568
- statement_params=statement_params,
581
+ statement_params,
569
582
  )
570
583
 
571
584
  if "|" in sproc_export_file_name:
@@ -575,7 +588,7 @@ class SGDRegressor(BaseTransformer):
575
588
  print("\n".join(fields[1:]))
576
589
 
577
590
  session.file.get(
578
- os.path.join(stage_result_file_name, sproc_export_file_name),
591
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
579
592
  local_result_file_name,
580
593
  statement_params=statement_params
581
594
  )
@@ -621,7 +634,7 @@ class SGDRegressor(BaseTransformer):
621
634
 
622
635
  # Register vectorized UDF for batch inference
623
636
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
624
- safe_id=self.id, method=inference_method)
637
+ safe_id=self._get_rand_id(), method=inference_method)
625
638
 
626
639
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
627
640
  # will try to pickle all of self which fails.
@@ -713,7 +726,7 @@ class SGDRegressor(BaseTransformer):
713
726
  return transformed_pandas_df.to_dict("records")
714
727
 
715
728
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
716
- safe_id=self.id
729
+ safe_id=self._get_rand_id()
717
730
  )
718
731
 
719
732
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -769,26 +782,37 @@ class SGDRegressor(BaseTransformer):
769
782
  # input cols need to match unquoted / quoted
770
783
  input_cols = self.input_cols
771
784
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
785
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
772
786
 
773
787
  estimator = self._sklearn_object
774
788
 
775
- input_df = dataset[input_cols] # Select input columns with quoted column names.
776
- if hasattr(estimator, "feature_names_in_"):
777
- missing_features = []
778
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
779
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
780
- missing_features.append(f)
781
-
782
- if len(missing_features) > 0:
783
- raise ValueError(
784
- "The feature names should match with those that were passed during fit.\n"
785
- f"Features seen during fit call but not present in the input: {missing_features}\n"
786
- f"Features in the input dataframe : {input_cols}\n"
787
- )
788
- input_df.columns = getattr(estimator, "feature_names_in_")
789
- else:
790
- # Just rename the column names to unquoted identifiers.
791
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
789
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
790
+ missing_features = []
791
+ features_in_dataset = set(dataset.columns)
792
+ columns_to_select = []
793
+ for i, f in enumerate(features_required_by_estimator):
794
+ if (
795
+ i >= len(input_cols)
796
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
797
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
798
+ and quoted_input_cols[i] not in features_in_dataset)
799
+ ):
800
+ missing_features.append(f)
801
+ elif input_cols[i] in features_in_dataset:
802
+ columns_to_select.append(input_cols[i])
803
+ elif unquoted_input_cols[i] in features_in_dataset:
804
+ columns_to_select.append(unquoted_input_cols[i])
805
+ else:
806
+ columns_to_select.append(quoted_input_cols[i])
807
+
808
+ if len(missing_features) > 0:
809
+ raise ValueError(
810
+ "The feature names should match with those that were passed during fit.\n"
811
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
812
+ f"Features in the input dataframe : {input_cols}\n"
813
+ )
814
+ input_df = dataset[columns_to_select]
815
+ input_df.columns = features_required_by_estimator
792
816
 
793
817
  transformed_numpy_array = getattr(estimator, inference_method)(
794
818
  input_df
@@ -869,11 +893,18 @@ class SGDRegressor(BaseTransformer):
869
893
  Transformed dataset.
870
894
  """
871
895
  if isinstance(dataset, DataFrame):
896
+ expected_type_inferred = "float"
897
+ # when it is classifier, infer the datatype from label columns
898
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
899
+ expected_type_inferred = convert_sp_to_sf_type(
900
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
901
+ )
902
+
872
903
  output_df = self._batch_inference(
873
904
  dataset=dataset,
874
905
  inference_method="predict",
875
906
  expected_output_cols_list=self.output_cols,
876
- expected_output_cols_type="float",
907
+ expected_output_cols_type=expected_type_inferred,
877
908
  )
878
909
  elif isinstance(dataset, pd.DataFrame):
879
910
  output_df = self._sklearn_inference(
@@ -944,10 +975,10 @@ class SGDRegressor(BaseTransformer):
944
975
 
945
976
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
946
977
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
947
- Returns an empty list if current object is not a classifier or not yet fitted.
978
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
948
979
  """
949
980
  if getattr(self._sklearn_object, "classes_", None) is None:
950
- return []
981
+ return [output_cols_prefix]
951
982
 
952
983
  classes = self._sklearn_object.classes_
953
984
  if isinstance(classes, numpy.ndarray):
@@ -1172,7 +1203,7 @@ class SGDRegressor(BaseTransformer):
1172
1203
  cp.dump(self._sklearn_object, local_score_file)
1173
1204
 
1174
1205
  # Create temp stage to run score.
1175
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1206
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1176
1207
  session = dataset._session
1177
1208
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1178
1209
  SqlResultValidator(
@@ -1186,8 +1217,9 @@ class SGDRegressor(BaseTransformer):
1186
1217
  expected_value=f"Stage area {score_stage_name} successfully created."
1187
1218
  ).validate()
1188
1219
 
1189
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1190
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1220
+ # Use posixpath to construct stage paths
1221
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1222
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1191
1223
  statement_params = telemetry.get_function_usage_statement_params(
1192
1224
  project=_PROJECT,
1193
1225
  subproject=_SUBPROJECT,
@@ -1213,6 +1245,7 @@ class SGDRegressor(BaseTransformer):
1213
1245
  replace=True,
1214
1246
  session=session,
1215
1247
  statement_params=statement_params,
1248
+ anonymous=True
1216
1249
  )
1217
1250
  def score_wrapper_sproc(
1218
1251
  session: Session,
@@ -1220,7 +1253,8 @@ class SGDRegressor(BaseTransformer):
1220
1253
  stage_score_file_name: str,
1221
1254
  input_cols: List[str],
1222
1255
  label_cols: List[str],
1223
- sample_weight_col: Optional[str]
1256
+ sample_weight_col: Optional[str],
1257
+ statement_params: Dict[str, str]
1224
1258
  ) -> float:
1225
1259
  import cloudpickle as cp
1226
1260
  import numpy as np
@@ -1270,14 +1304,14 @@ class SGDRegressor(BaseTransformer):
1270
1304
  api_calls=[Session.call],
1271
1305
  custom_tags=dict([("autogen", True)]),
1272
1306
  )
1273
- score = session.call(
1274
- score_sproc_name,
1307
+ score = score_wrapper_sproc(
1308
+ session,
1275
1309
  query,
1276
1310
  stage_score_file_name,
1277
1311
  identifier.get_unescaped_names(self.input_cols),
1278
1312
  identifier.get_unescaped_names(self.label_cols),
1279
1313
  identifier.get_unescaped_names(self.sample_weight_col),
1280
- statement_params=statement_params,
1314
+ statement_params,
1281
1315
  )
1282
1316
 
1283
1317
  cleanup_temp_files([local_score_file_name])
@@ -1295,18 +1329,20 @@ class SGDRegressor(BaseTransformer):
1295
1329
  if self._sklearn_object._estimator_type == 'classifier':
1296
1330
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1297
1331
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1298
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1332
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1333
+ ([] if self._drop_input_cols else inputs) + outputs)
1299
1334
  # For regressor, the type of predict is float64
1300
1335
  elif self._sklearn_object._estimator_type == 'regressor':
1301
1336
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1302
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1303
-
1337
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1338
+ ([] if self._drop_input_cols else inputs) + outputs)
1304
1339
  for prob_func in PROB_FUNCTIONS:
1305
1340
  if hasattr(self, prob_func):
1306
1341
  output_cols_prefix: str = f"{prob_func}_"
1307
1342
  output_column_names = self._get_output_column_names(output_cols_prefix)
1308
1343
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1309
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1344
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1345
+ ([] if self._drop_input_cols else inputs) + outputs)
1310
1346
 
1311
1347
  @property
1312
1348
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -235,7 +237,6 @@ class TheilSenRegressor(BaseTransformer):
235
237
  sample_weight_col: Optional[str] = None,
236
238
  ) -> None:
237
239
  super().__init__()
238
- self.id = str(uuid4()).replace("-", "_").upper()
239
240
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
240
241
 
241
242
  self._deps = list(deps)
@@ -263,6 +264,15 @@ class TheilSenRegressor(BaseTransformer):
263
264
  self.set_drop_input_cols(drop_input_cols)
264
265
  self.set_sample_weight_col(sample_weight_col)
265
266
 
267
+ def _get_rand_id(self) -> str:
268
+ """
269
+ Generate random id to be used in sproc and stage names.
270
+
271
+ Returns:
272
+ Random id string usable in sproc, table, and stage names.
273
+ """
274
+ return str(uuid4()).replace("-", "_").upper()
275
+
266
276
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
267
277
  """
268
278
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -341,7 +351,7 @@ class TheilSenRegressor(BaseTransformer):
341
351
  cp.dump(self._sklearn_object, local_transform_file)
342
352
 
343
353
  # Create temp stage to run fit.
344
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
354
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
345
355
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
346
356
  SqlResultValidator(
347
357
  session=session,
@@ -354,11 +364,12 @@ class TheilSenRegressor(BaseTransformer):
354
364
  expected_value=f"Stage area {transform_stage_name} successfully created."
355
365
  ).validate()
356
366
 
357
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
367
+ # Use posixpath to construct stage paths
368
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
369
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
358
370
  local_result_file_name = get_temp_file_path()
359
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
360
371
 
361
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
372
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
362
373
  statement_params = telemetry.get_function_usage_statement_params(
363
374
  project=_PROJECT,
364
375
  subproject=_SUBPROJECT,
@@ -384,6 +395,7 @@ class TheilSenRegressor(BaseTransformer):
384
395
  replace=True,
385
396
  session=session,
386
397
  statement_params=statement_params,
398
+ anonymous=True
387
399
  )
388
400
  def fit_wrapper_sproc(
389
401
  session: Session,
@@ -392,7 +404,8 @@ class TheilSenRegressor(BaseTransformer):
392
404
  stage_result_file_name: str,
393
405
  input_cols: List[str],
394
406
  label_cols: List[str],
395
- sample_weight_col: Optional[str]
407
+ sample_weight_col: Optional[str],
408
+ statement_params: Dict[str, str]
396
409
  ) -> str:
397
410
  import cloudpickle as cp
398
411
  import numpy as np
@@ -459,15 +472,15 @@ class TheilSenRegressor(BaseTransformer):
459
472
  api_calls=[Session.call],
460
473
  custom_tags=dict([("autogen", True)]),
461
474
  )
462
- sproc_export_file_name = session.call(
463
- fit_sproc_name,
475
+ sproc_export_file_name = fit_wrapper_sproc(
476
+ session,
464
477
  query,
465
478
  stage_transform_file_name,
466
479
  stage_result_file_name,
467
480
  identifier.get_unescaped_names(self.input_cols),
468
481
  identifier.get_unescaped_names(self.label_cols),
469
482
  identifier.get_unescaped_names(self.sample_weight_col),
470
- statement_params=statement_params,
483
+ statement_params,
471
484
  )
472
485
 
473
486
  if "|" in sproc_export_file_name:
@@ -477,7 +490,7 @@ class TheilSenRegressor(BaseTransformer):
477
490
  print("\n".join(fields[1:]))
478
491
 
479
492
  session.file.get(
480
- os.path.join(stage_result_file_name, sproc_export_file_name),
493
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
481
494
  local_result_file_name,
482
495
  statement_params=statement_params
483
496
  )
@@ -523,7 +536,7 @@ class TheilSenRegressor(BaseTransformer):
523
536
 
524
537
  # Register vectorized UDF for batch inference
525
538
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
526
- safe_id=self.id, method=inference_method)
539
+ safe_id=self._get_rand_id(), method=inference_method)
527
540
 
528
541
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
529
542
  # will try to pickle all of self which fails.
@@ -615,7 +628,7 @@ class TheilSenRegressor(BaseTransformer):
615
628
  return transformed_pandas_df.to_dict("records")
616
629
 
617
630
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
618
- safe_id=self.id
631
+ safe_id=self._get_rand_id()
619
632
  )
620
633
 
621
634
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -671,26 +684,37 @@ class TheilSenRegressor(BaseTransformer):
671
684
  # input cols need to match unquoted / quoted
672
685
  input_cols = self.input_cols
673
686
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
687
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
674
688
 
675
689
  estimator = self._sklearn_object
676
690
 
677
- input_df = dataset[input_cols] # Select input columns with quoted column names.
678
- if hasattr(estimator, "feature_names_in_"):
679
- missing_features = []
680
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
681
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
682
- missing_features.append(f)
683
-
684
- if len(missing_features) > 0:
685
- raise ValueError(
686
- "The feature names should match with those that were passed during fit.\n"
687
- f"Features seen during fit call but not present in the input: {missing_features}\n"
688
- f"Features in the input dataframe : {input_cols}\n"
689
- )
690
- input_df.columns = getattr(estimator, "feature_names_in_")
691
- else:
692
- # Just rename the column names to unquoted identifiers.
693
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
691
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
692
+ missing_features = []
693
+ features_in_dataset = set(dataset.columns)
694
+ columns_to_select = []
695
+ for i, f in enumerate(features_required_by_estimator):
696
+ if (
697
+ i >= len(input_cols)
698
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
699
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
700
+ and quoted_input_cols[i] not in features_in_dataset)
701
+ ):
702
+ missing_features.append(f)
703
+ elif input_cols[i] in features_in_dataset:
704
+ columns_to_select.append(input_cols[i])
705
+ elif unquoted_input_cols[i] in features_in_dataset:
706
+ columns_to_select.append(unquoted_input_cols[i])
707
+ else:
708
+ columns_to_select.append(quoted_input_cols[i])
709
+
710
+ if len(missing_features) > 0:
711
+ raise ValueError(
712
+ "The feature names should match with those that were passed during fit.\n"
713
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
714
+ f"Features in the input dataframe : {input_cols}\n"
715
+ )
716
+ input_df = dataset[columns_to_select]
717
+ input_df.columns = features_required_by_estimator
694
718
 
695
719
  transformed_numpy_array = getattr(estimator, inference_method)(
696
720
  input_df
@@ -771,11 +795,18 @@ class TheilSenRegressor(BaseTransformer):
771
795
  Transformed dataset.
772
796
  """
773
797
  if isinstance(dataset, DataFrame):
798
+ expected_type_inferred = "float"
799
+ # when it is classifier, infer the datatype from label columns
800
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
801
+ expected_type_inferred = convert_sp_to_sf_type(
802
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
803
+ )
804
+
774
805
  output_df = self._batch_inference(
775
806
  dataset=dataset,
776
807
  inference_method="predict",
777
808
  expected_output_cols_list=self.output_cols,
778
- expected_output_cols_type="float",
809
+ expected_output_cols_type=expected_type_inferred,
779
810
  )
780
811
  elif isinstance(dataset, pd.DataFrame):
781
812
  output_df = self._sklearn_inference(
@@ -846,10 +877,10 @@ class TheilSenRegressor(BaseTransformer):
846
877
 
847
878
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
848
879
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
849
- Returns an empty list if current object is not a classifier or not yet fitted.
880
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
850
881
  """
851
882
  if getattr(self._sklearn_object, "classes_", None) is None:
852
- return []
883
+ return [output_cols_prefix]
853
884
 
854
885
  classes = self._sklearn_object.classes_
855
886
  if isinstance(classes, numpy.ndarray):
@@ -1074,7 +1105,7 @@ class TheilSenRegressor(BaseTransformer):
1074
1105
  cp.dump(self._sklearn_object, local_score_file)
1075
1106
 
1076
1107
  # Create temp stage to run score.
1077
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1108
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1078
1109
  session = dataset._session
1079
1110
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1080
1111
  SqlResultValidator(
@@ -1088,8 +1119,9 @@ class TheilSenRegressor(BaseTransformer):
1088
1119
  expected_value=f"Stage area {score_stage_name} successfully created."
1089
1120
  ).validate()
1090
1121
 
1091
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1092
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1122
+ # Use posixpath to construct stage paths
1123
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1124
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1093
1125
  statement_params = telemetry.get_function_usage_statement_params(
1094
1126
  project=_PROJECT,
1095
1127
  subproject=_SUBPROJECT,
@@ -1115,6 +1147,7 @@ class TheilSenRegressor(BaseTransformer):
1115
1147
  replace=True,
1116
1148
  session=session,
1117
1149
  statement_params=statement_params,
1150
+ anonymous=True
1118
1151
  )
1119
1152
  def score_wrapper_sproc(
1120
1153
  session: Session,
@@ -1122,7 +1155,8 @@ class TheilSenRegressor(BaseTransformer):
1122
1155
  stage_score_file_name: str,
1123
1156
  input_cols: List[str],
1124
1157
  label_cols: List[str],
1125
- sample_weight_col: Optional[str]
1158
+ sample_weight_col: Optional[str],
1159
+ statement_params: Dict[str, str]
1126
1160
  ) -> float:
1127
1161
  import cloudpickle as cp
1128
1162
  import numpy as np
@@ -1172,14 +1206,14 @@ class TheilSenRegressor(BaseTransformer):
1172
1206
  api_calls=[Session.call],
1173
1207
  custom_tags=dict([("autogen", True)]),
1174
1208
  )
1175
- score = session.call(
1176
- score_sproc_name,
1209
+ score = score_wrapper_sproc(
1210
+ session,
1177
1211
  query,
1178
1212
  stage_score_file_name,
1179
1213
  identifier.get_unescaped_names(self.input_cols),
1180
1214
  identifier.get_unescaped_names(self.label_cols),
1181
1215
  identifier.get_unescaped_names(self.sample_weight_col),
1182
- statement_params=statement_params,
1216
+ statement_params,
1183
1217
  )
1184
1218
 
1185
1219
  cleanup_temp_files([local_score_file_name])
@@ -1197,18 +1231,20 @@ class TheilSenRegressor(BaseTransformer):
1197
1231
  if self._sklearn_object._estimator_type == 'classifier':
1198
1232
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1199
1233
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1200
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1234
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1235
+ ([] if self._drop_input_cols else inputs) + outputs)
1201
1236
  # For regressor, the type of predict is float64
1202
1237
  elif self._sklearn_object._estimator_type == 'regressor':
1203
1238
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1204
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1205
-
1239
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1240
+ ([] if self._drop_input_cols else inputs) + outputs)
1206
1241
  for prob_func in PROB_FUNCTIONS:
1207
1242
  if hasattr(self, prob_func):
1208
1243
  output_cols_prefix: str = f"{prob_func}_"
1209
1244
  output_column_names = self._get_output_column_names(output_cols_prefix)
1210
1245
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1211
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1246
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1247
+ ([] if self._drop_input_cols else inputs) + outputs)
1212
1248
 
1213
1249
  @property
1214
1250
  def model_signatures(self) -> Dict[str, ModelSignature]: