snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/covariance/shrunk_covariance.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -195,7 +197,6 @@ class ShrunkCovariance(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -217,6 +218,15 @@ class ShrunkCovariance(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
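
The hunk above drops the id fixed at construction (self.id) in favor of a per-call helper, so every fit or score names its temporary stage and sproc with a fresh uuid. A minimal standalone sketch of the behavioral difference (names hypothetical):

    from uuid import uuid4

    def _get_rand_id() -> str:
        # Fresh random id per call; safe for temporary sproc/stage/table names.
        return str(uuid4()).replace("-", "_").upper()

    # Old behavior: one id fixed at __init__, reused by every fit/score call.
    fixed_id = _get_rand_id()
    old_names = {f"SNOWML_TRANSFORM_{fixed_id}" for _ in range(2)}
    # New behavior: each call draws its own id, so names are effectively unique.
    new_names = {f"SNOWML_TRANSFORM_{_get_rand_id()}" for _ in range(2)}
    assert len(old_names) == 1 and len(new_names) == 2
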
@@ -295,7 +305,7 @@ class ShrunkCovariance(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -308,11 +318,12 @@ class ShrunkCovariance(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
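
These hunks also swap os.path.join for posixpath.join when building stage paths. Stage paths are always forward-slash separated, while os.path follows the client OS, so a Windows client would otherwise produce backslashed paths. A small sketch of the distinction (paths hypothetical; ntpath stands in for os.path on Windows):

    import ntpath      # behaves like os.path on a Windows client
    import posixpath

    stage_name = "SNOWML_TRANSFORM_ABC123"   # hypothetical temporary stage
    base_name = "model.pkl"                  # hypothetical pickled estimator

    # os.path.join resolves to ntpath.join on Windows and emits a backslash:
    assert ntpath.join(stage_name, base_name) == r"SNOWML_TRANSFORM_ABC123\model.pkl"
    # posixpath.join always emits the forward slash that stage paths require:
    assert posixpath.join(stage_name, base_name) == "SNOWML_TRANSFORM_ABC123/model.pkl"
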
@@ -338,6 +349,7 @@ class ShrunkCovariance(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -346,7 +358,8 @@ class ShrunkCovariance(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -413,15 +426,15 @@ class ShrunkCovariance(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -431,7 +444,7 @@ class ShrunkCovariance(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
            local_result_file_name,
            statement_params=statement_params
        )
@@ -477,7 +490,7 @@ class ShrunkCovariance(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -569,7 +582,7 @@ class ShrunkCovariance(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -625,26 +638,37 @@ class ShrunkCovariance(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
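
The rewritten matching block resolves each feature the estimator saw at fit time against three spellings of the input columns (as passed, unquoted, and quoted) and verifies that one of them actually exists in the dataframe, instead of indexing with the quoted names and assuming they match. A self-contained pandas sketch of that resolution, with toy names standing in for the identifier helpers:

    import pandas as pd

    # Toy spellings standing in for identifier.get_unescaped_names /
    # identifier.get_escaped_names applied to real Snowflake identifiers.
    input_cols = ['"FEAT_A"', "FEAT_B"]
    unquoted_input_cols = ["FEAT_A", "FEAT_B"]
    quoted_input_cols = ['"FEAT_A"', '"FEAT_B"']
    features_required = ["FEAT_A", "FEAT_B"]  # stands in for estimator.feature_names_in_
    dataset = pd.DataFrame({"FEAT_A": [1.0], '"FEAT_B"': [2.0]})  # toy input frame

    features_in_dataset = set(dataset.columns)
    missing_features, columns_to_select = [], []
    for i, f in enumerate(features_required):
        if i >= len(input_cols):
            missing_features.append(f)
            continue
        spellings = (input_cols[i], unquoted_input_cols[i], quoted_input_cols[i])
        if f not in spellings or not any(s in features_in_dataset for s in spellings):
            missing_features.append(f)
        else:
            # Prefer the spelling as passed, then unquoted, then quoted.
            columns_to_select.append(next(s for s in spellings if s in features_in_dataset))

    if missing_features:
        raise ValueError(f"Features seen during fit but missing from the input: {missing_features}")

    input_df = dataset[columns_to_select]
    input_df.columns = features_required  # rename to what the estimator expects
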
@@ -723,11 +747,18 @@ class ShrunkCovariance(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
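
Previously the Snowpark branch passed an empty expected_output_cols_type, leaving the output column type unspecified; it now derives the Snowflake type from the stored 'predict' signature. A minimal sketch, assuming the internal Snowpark helper convert_sp_to_sf_type (imported in the second hunk near the top of the file) maps a Snowpark DataType to its SQL type name:

    from snowflake.snowpark.types import DoubleType
    from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

    # If the recorded 'predict' signature declares a DOUBLE output column,
    # the wrapper now passes "DOUBLE" instead of "" as the expected type.
    expected_type_inferred = convert_sp_to_sf_type(DoubleType())
    print(expected_type_inferred)  # expected: DOUBLE
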
@@ -798,10 +829,10 @@ class ShrunkCovariance(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1026,7 +1057,7 @@ class ShrunkCovariance(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1040,8 +1071,9 @@ class ShrunkCovariance(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1067,6 +1099,7 @@ class ShrunkCovariance(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1074,7 +1107,8 @@ class ShrunkCovariance(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1124,14 +1158,14 @@ class ShrunkCovariance(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1149,18 +1183,20 @@ class ShrunkCovariance(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
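
The signature hunks change what each method's ModelSignature declares: output schemas now list the pass-through input columns ahead of the predictions unless drop_input_cols is set. A short sketch using the public signature classes (column names hypothetical):

    from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

    inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="FEAT_A")]     # hypothetical feature
    outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_0")]  # hypothetical prediction
    drop_input_cols = False  # mirrors self._drop_input_cols

    # Outputs include the pass-through inputs unless they are dropped.
    sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
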
snowflake/ml/modeling/decomposition/dictionary_learning.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -282,7 +284,6 @@ class DictionaryLearning(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -318,6 +319,15 @@ class DictionaryLearning(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -396,7 +406,7 @@ class DictionaryLearning(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -409,11 +419,12 @@ class DictionaryLearning(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -439,6 +450,7 @@ class DictionaryLearning(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -447,7 +459,8 @@ class DictionaryLearning(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -514,15 +527,15 @@ class DictionaryLearning(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -532,7 +545,7 @@ class DictionaryLearning(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
            local_result_file_name,
            statement_params=statement_params
        )
@@ -578,7 +591,7 @@ class DictionaryLearning(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -670,7 +683,7 @@ class DictionaryLearning(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -726,26 +739,37 @@ class DictionaryLearning(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -824,11 +848,18 @@ class DictionaryLearning(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -901,10 +932,10 @@ class DictionaryLearning(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1129,7 +1160,7 @@ class DictionaryLearning(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1143,8 +1174,9 @@ class DictionaryLearning(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1170,6 +1202,7 @@ class DictionaryLearning(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
        )
         def score_wrapper_sproc(
             session: Session,
@@ -1177,7 +1210,8 @@ class DictionaryLearning(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1227,14 +1261,14 @@ class DictionaryLearning(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1252,18 +1286,20 @@ class DictionaryLearning(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: