snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/covariance/min_cov_det.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -207,7 +209,6 @@ class MinCovDet(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -230,6 +231,15 @@ class MinCovDet(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
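Net effect of the hunks above: every temporary stage, sproc, and UDF name is now derived from a fresh UUID per call instead of a single self.id fixed at construction, so repeated fit()/predict() calls on one estimator instance no longer collide on object names. The same autogenerated change repeats across every +79/-43 modeling file in the list. A stand-alone sketch of the helper, outside the class and purely illustrative:

    # Fresh UUID per call, reshaped to be legal in unquoted Snowflake
    # identifiers: hyphens become underscores, letters are upper-cased.
    from uuid import uuid4

    def get_rand_id() -> str:
        return str(uuid4()).replace("-", "_").upper()

    print(get_rand_id())  # e.g. 0E2F9A4C_1B7D_4E7A_9C3B_2D8F1A6B5C4D
    print(get_rand_id())  # a different id each call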
@@ -308,7 +318,7 @@ class MinCovDet(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -321,11 +331,12 @@ class MinCovDet(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
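The os.path.join → posixpath.join switch matters because Snowflake stage paths are always '/'-separated, while os.path.join follows the client OS separator and would emit backslashes on Windows. A quick illustration with a made-up stage name:

    import os
    import posixpath

    stage_name = "SNOWML_TRANSFORM_ABC123"           # hypothetical stage
    local_file = os.path.basename("/tmp/model.pkl")  # -> "model.pkl"

    # Correct on every OS:
    print(posixpath.join(stage_name, local_file))    # SNOWML_TRANSFORM_ABC123/model.pkl
    # os.path.join would yield "SNOWML_TRANSFORM_ABC123\\model.pkl" on Windows.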
@@ -351,6 +362,7 @@ class MinCovDet(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -359,7 +371,8 @@ class MinCovDet(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -426,15 +439,15 @@ class MinCovDet(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -444,7 +457,7 @@ class MinCovDet(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -490,7 +503,7 @@ class MinCovDet(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -582,7 +595,7 @@ class MinCovDet(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -638,26 +651,37 @@ class MinCovDet(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols]  # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols  # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
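The rewritten block above replaces a two-way name check (exact or unquoted spelling) plus an unconditional dataset[input_cols] selection with a three-way check that also tries the quoted spelling and selects whichever variant is actually present in the dataframe. A self-contained miniature of that matching logic, with invented column names; the real code keeps the per-branch elif chain shown in the diff, and the next() here is just a compact equivalent:

    import pandas as pd

    input_cols = ['"a"', "B"]    # as configured on the transformer
    unquoted   = ["a", "B"]      # identifier.get_unescaped_names(...)
    quoted     = ['"a"', '"B"']  # identifier.get_escaped_names(...)
    required   = ["a", "B"]      # estimator.feature_names_in_

    df = pd.DataFrame({'"a"': [1.0], "B": [2.0]})
    present = set(df.columns)

    columns_to_select, missing = [], []
    for i, f in enumerate(required):
        spellings = (input_cols[i], unquoted[i], quoted[i])
        if f not in spellings or not any(s in present for s in spellings):
            missing.append(f)
        else:
            # prefer original, then unquoted, then quoted -- same order as the diff
            columns_to_select.append(next(s for s in spellings if s in present))

    assert not missing
    print(df[columns_to_select])  # selection succeeds under whichever spelling exists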
@@ -736,11 +760,18 @@ class MinCovDet(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
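predict() on a Snowpark DataFrame previously passed an empty expected output type to _batch_inference; it now derives the Snowflake SQL type name from the stored 'predict' signature via convert_sp_to_sf_type, the same internal Snowpark helper the import hunk adds. A minimal offline check of what that helper returns (no session needed, assuming snowflake-snowpark-python is installed):

    from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
    from snowflake.snowpark.types import DoubleType, LongType

    print(convert_sp_to_sf_type(DoubleType()))  # DOUBLE
    print(convert_sp_to_sf_type(LongType()))    # BIGINT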
@@ -811,10 +842,10 @@ class MinCovDet(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1039,7 +1070,7 @@ class MinCovDet(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1053,8 +1084,9 @@ class MinCovDet(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1080,6 +1112,7 @@ class MinCovDet(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1087,7 +1120,8 @@ class MinCovDet(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1137,14 +1171,14 @@ class MinCovDet(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1162,18 +1196,20 @@ class MinCovDet(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
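Across all three signature branches above, the declared outputs now prepend the input features unless drop_input_cols is set, so the signature matches the pass-through columns that batch inference actually returns alongside the predictions. The list arithmetic is plain Python; stand-in names below, FeatureSpec objects in the real code:

    drop_input_cols = False
    inputs  = ["SEPAL_LENGTH", "SEPAL_WIDTH"]
    outputs = ["OUTPUT_PREDICT"]

    signature_outputs = ([] if drop_input_cols else inputs) + outputs
    print(signature_outputs)
    # ['SEPAL_LENGTH', 'SEPAL_WIDTH', 'OUTPUT_PREDICT'] -- inputs pass through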
snowflake/ml/modeling/covariance/oas.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -190,7 +192,6 @@ class OAS(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -211,6 +212,15 @@ class OAS(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -289,7 +299,7 @@ class OAS(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -302,11 +312,12 @@ class OAS(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -332,6 +343,7 @@ class OAS(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -340,7 +352,8 @@ class OAS(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -407,15 +420,15 @@ class OAS(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         if "|" in sproc_export_file_name:
@@ -425,7 +438,7 @@ class OAS(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -471,7 +484,7 @@ class OAS(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -563,7 +576,7 @@ class OAS(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -619,26 +632,37 @@ class OAS(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols]  # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols  # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -717,11 +741,18 @@ class OAS(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -792,10 +823,10 @@ class OAS(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1020,7 +1051,7 @@ class OAS(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1034,8 +1065,9 @@ class OAS(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1061,6 +1093,7 @@ class OAS(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1068,7 +1101,8 @@ class OAS(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1118,14 +1152,14 @@ class OAS(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1143,18 +1177,20 @@ class OAS(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: