snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/cluster/mini_batch_k_means.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -281,7 +283,6 @@ class MiniBatchKMeans(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -312,6 +313,15 @@ class MiniBatchKMeans(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
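Note: the new _get_rand_id() helper replaces the self.id that 1.0.1 assigned once in __init__, so each stage, sproc, and table created later gets a fresh identifier rather than reusing one id per estimator instance. A minimal standalone sketch of the id format, taken directly from the method body above:

from uuid import uuid4

def get_rand_id() -> str:
    # uuid4 hex groups joined by underscores and upper-cased, so the result
    # can be embedded in unquoted Snowflake object names.
    return str(uuid4()).replace("-", "_").upper()

rand_id = get_rand_id()
print(rand_id)                        # e.g. 1B9D6BCD_BBFD_4B2D_9B5D_AB8DFBBD4BED
print(f"SNOWML_TRANSFORM_{rand_id}")  # stage name as built in fit()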
@@ -390,7 +400,7 @@ class MiniBatchKMeans(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -403,11 +413,12 @@ class MiniBatchKMeans(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
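Note: Snowflake stage paths always use forward slashes, while os.path.join uses the host OS separator and would produce backslash paths on Windows. The hunks above therefore switch stage-path construction to posixpath.join. A quick standalone illustration (ntpath stands in for os.path on Windows; the names are made up):

import ntpath      # os.path as it behaves on Windows
import posixpath   # os.path as it behaves on POSIX systems

stage_name = "SNOWML_TRANSFORM_ABC123"   # hypothetical stage name
file_name = "model.pkl"                  # hypothetical file name

print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl -- invalid for a stage
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl -- valid on any host OS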
@@ -433,6 +444,7 @@ class MiniBatchKMeans(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -441,7 +453,8 @@ class MiniBatchKMeans(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
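Note: anonymous=True registers the wrapper as an anonymous stored procedure, and a later hunk switches from session.call(fit_sproc_name, ...) to invoking the returned handle directly (an anonymous sproc has no name to call). A hedged sketch of the pattern with a made-up body; the real wrapper unpickles the estimator from a stage, fits it, and uploads the result:

from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

def fit_via_anonymous_sproc(session: Session) -> str:
    @sproc(replace=True, session=session, anonymous=True)
    def fit_wrapper_sproc(session: Session, query: str) -> str:
        return f"fitted on: {query}"  # placeholder for the real fit logic

    # The decorator returns a StoredProcedure handle; being anonymous, it is
    # invoked through the handle instead of session.call().
    return fit_wrapper_sproc(session, "SELECT * FROM MY_TABLE")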
@@ -508,15 +521,15 @@ class MiniBatchKMeans(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
 
@@ -526,7 +539,7 @@ class MiniBatchKMeans(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -572,7 +585,7 @@ class MiniBatchKMeans(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -664,7 +677,7 @@ class MiniBatchKMeans(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -720,26 +733,37 @@ class MiniBatchKMeans(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
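Note: the rewritten block accepts each feature the estimator saw at fit time under its as-configured, unquoted, or quoted spelling, selects whichever spelling actually exists in the dataset, and renames the selection back to the estimator's feature names (1.0.1 only compared the as-given and unquoted forms and always selected input_cols). A self-contained pandas sketch of the idea; the sample column names are illustrative and the identifier helpers are inlined as literals:

import pandas as pd

features_required = ["COL_A", "COL_B"]      # e.g. estimator.feature_names_in_
input_cols = ['"COL_A"', '"COL_B"']         # as configured (quoted identifiers)
unquoted_input_cols = ["COL_A", "COL_B"]    # identifier.get_unescaped_names(...)
quoted_input_cols = ['"COL_A"', '"COL_B"']  # identifier.get_escaped_names(...)

# The dataset may carry a mix of quoted and unquoted spellings.
dataset = pd.DataFrame({"COL_A": [1.0], '"COL_B"': [2.0]})
features_in_dataset = set(dataset.columns)

missing_features, columns_to_select = [], []
for i, f in enumerate(features_required):
    candidates = (input_cols[i], unquoted_input_cols[i], quoted_input_cols[i])
    if f not in candidates or not any(c in features_in_dataset for c in candidates):
        missing_features.append(f)
    elif input_cols[i] in features_in_dataset:
        columns_to_select.append(input_cols[i])
    elif unquoted_input_cols[i] in features_in_dataset:
        columns_to_select.append(unquoted_input_cols[i])
    else:
        columns_to_select.append(quoted_input_cols[i])

if missing_features:
    raise ValueError(f"Features seen during fit but not present in the input: {missing_features}")

input_df = dataset[columns_to_select]
input_df.columns = features_required  # rename to what the estimator expects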
@@ -820,11 +844,18 @@ class MiniBatchKMeans(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
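Note: on the Snowpark path, predict() now derives the output column type from the signature inferred at fit time instead of passing an empty type string to _batch_inference. convert_sp_to_sf_type is a Snowpark-internal helper that maps a Snowpark datatype to a Snowflake SQL type name; a small sketch of that mapping (the exact returned spellings are indicative, not verified against a specific Snowpark version):

from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
from snowflake.snowpark.types import DoubleType, LongType

print(convert_sp_to_sf_type(DoubleType()))  # e.g. DOUBLE
print(convert_sp_to_sf_type(LongType()))    # e.g. BIGINT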
@@ -897,10 +928,10 @@ class MiniBatchKMeans(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
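Note: for estimators without a classes_ attribute, _get_output_column_names now returns the prefix itself as a single column name instead of an empty list, so decision_function() and similar methods still yield one output column. A simplified standalone sketch of the before/after behavior; the classifier branch here is an approximation of the real prefix-plus-class naming:

from typing import List, Optional, Sequence

def get_output_column_names(output_cols_prefix: str, classes: Optional[Sequence] = None) -> List[str]:
    if classes is None:                  # not a classifier, or not yet fitted
        return [output_cols_prefix]      # 1.0.3 behavior; 1.0.1 returned []
    return [f"{output_cols_prefix}{c}" for c in classes]

print(get_output_column_names("decision_function_"))      # ['decision_function_']
print(get_output_column_names("predict_proba_", [0, 1]))  # ['predict_proba_0', 'predict_proba_1']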
@@ -1125,7 +1156,7 @@ class MiniBatchKMeans(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1139,8 +1170,9 @@ class MiniBatchKMeans(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1166,6 +1198,7 @@ class MiniBatchKMeans(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1173,7 +1206,8 @@ class MiniBatchKMeans(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1223,14 +1257,14 @@ class MiniBatchKMeans(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1248,18 +1282,20 @@ class MiniBatchKMeans(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
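Note: every generated signature now lists the input features ahead of the prediction columns unless drop_input_cols is set, reflecting that batch inference passes input columns through to its output. A small illustration using the same signature classes; the column names are made up:

from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

drop_input_cols = False
inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="FEATURE_0")]
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_0")]

sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
# With drop_input_cols=False the output side is [FEATURE_0, OUTPUT_0];
# with drop_input_cols=True it collapses to just [OUTPUT_0].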
snowflake/ml/modeling/cluster/optics.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -299,7 +301,6 @@ class OPTICS(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -332,6 +333,15 @@ class OPTICS(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -410,7 +420,7 @@ class OPTICS(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -423,11 +433,12 @@ class OPTICS(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -453,6 +464,7 @@ class OPTICS(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -461,7 +473,8 @@ class OPTICS(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -528,15 +541,15 @@ class OPTICS(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
@@ -546,7 +559,7 @@ class OPTICS(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -592,7 +605,7 @@ class OPTICS(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -684,7 +697,7 @@ class OPTICS(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -740,26 +753,37 @@ class OPTICS(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -838,11 +862,18 @@ class OPTICS(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -913,10 +944,10 @@ class OPTICS(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1141,7 +1172,7 @@ class OPTICS(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1155,8 +1186,9 @@ class OPTICS(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1182,6 +1214,7 @@ class OPTICS(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1189,7 +1222,8 @@ class OPTICS(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1239,14 +1273,14 @@ class OPTICS(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1264,18 +1298,20 @@ class OPTICS(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
        elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: