snowflake-ml-python 1.0.1 → 1.0.3 (py3-none-any.whl)

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/multiclass/output_code_classifier.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -206,7 +208,6 @@ class OutputCodeClassifier(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimator)
         self._deps = list(deps)
@@ -229,6 +230,15 @@ class OutputCodeClassifier(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -307,7 +317,7 @@ class OutputCodeClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -320,11 +330,12 @@ class OutputCodeClassifier(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -350,6 +361,7 @@ class OutputCodeClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -358,7 +370,8 @@ class OutputCodeClassifier(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -425,15 +438,15 @@ class OutputCodeClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -443,7 +456,7 @@ class OutputCodeClassifier(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -489,7 +502,7 @@ class OutputCodeClassifier(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -581,7 +594,7 @@ class OutputCodeClassifier(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -637,26 +650,37 @@ class OutputCodeClassifier(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -737,11 +761,18 @@ class OutputCodeClassifier(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -812,10 +843,10 @@ class OutputCodeClassifier(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1040,7 +1071,7 @@ class OutputCodeClassifier(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1054,8 +1085,9 @@ class OutputCodeClassifier(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1081,6 +1113,7 @@ class OutputCodeClassifier(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1088,7 +1121,8 @@ class OutputCodeClassifier(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1138,14 +1172,14 @@ class OutputCodeClassifier(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1163,18 +1197,20 @@ class OutputCodeClassifier(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
            outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
            outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
            outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
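
A note on the recurring "# Use posixpath to construct stage paths" change above: Snowflake stage paths always use forward slashes, while os.path.join is platform-dependent and emits backslashes on Windows clients, which would produce invalid stage paths. A minimal sketch of the difference (the stage and file names below are made up for illustration):

import os.path
import posixpath

stage_name = "SNOWML_TRANSFORM_ABC123"  # hypothetical temporary stage name
file_name = "model.pkl"                 # hypothetical pickled estimator file

# posixpath.join always joins with "/", which is what stage paths expect.
assert posixpath.join(stage_name, file_name) == "SNOWML_TRANSFORM_ABC123/model.pkl"

# os.path.join delegates to the host platform: on Windows (ntpath) it would
# produce "SNOWML_TRANSFORM_ABC123\\model.pkl", an invalid stage path.
print(os.path.join(stage_name, file_name))
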
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -205,7 +207,6 @@ class BernoulliNB(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -229,6 +230,15 @@ class BernoulliNB(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -307,7 +317,7 @@ class BernoulliNB(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -320,11 +330,12 @@ class BernoulliNB(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -350,6 +361,7 @@ class BernoulliNB(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -358,7 +370,8 @@ class BernoulliNB(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -425,15 +438,15 @@ class BernoulliNB(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -443,7 +456,7 @@ class BernoulliNB(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -489,7 +502,7 @@ class BernoulliNB(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -581,7 +594,7 @@ class BernoulliNB(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -637,26 +650,37 @@ class BernoulliNB(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -737,11 +761,18 @@ class BernoulliNB(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -812,10 +843,10 @@ class BernoulliNB(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1044,7 +1075,7 @@ class BernoulliNB(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1058,8 +1089,9 @@ class BernoulliNB(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1085,6 +1117,7 @@ class BernoulliNB(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1092,7 +1125,8 @@ class BernoulliNB(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1142,14 +1176,14 @@ class BernoulliNB(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1167,18 +1201,20 @@ class BernoulliNB(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
            outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
            outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
            outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
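
The largest behavioral change in these diffs is the rewritten column-matching block in the sklearn inference path: instead of requiring an exact or unquoted match against feature_names_in_, each input column is now tried in three forms (as configured, unescaped, and escaped) against the columns actually present in the dataset. A standalone sketch of that matching logic, using toy quote/unquote helpers as stand-ins for the library's identifier.get_unescaped_names / get_escaped_names utilities (the column names are invented for illustration):

from typing import List

# Stand-ins for the identifier helpers (assumption: the real helpers strip
# and add double quotes around Snowflake identifiers).
def unquote(cols: List[str]) -> List[str]:
    return [c.strip('"') for c in cols]

def quote(cols: List[str]) -> List[str]:
    return [f'"{c}"' for c in cols]

input_cols = ['"sepal_length"', '"sepal_width"']   # as configured on the transformer
unquoted = unquote(input_cols)                     # sepal_length, sepal_width
quoted = quote(unquoted)                           # "sepal_length", "sepal_width"

dataset_columns = {"sepal_length", "sepal_width"}  # columns of the pandas frame
required = ["sepal_length", "sepal_width"]         # estimator.feature_names_in_

missing, to_select = [], []
for i, f in enumerate(required):
    if (
        i >= len(input_cols)
        # no form of the configured name matches the required feature name
        or (input_cols[i] != f and unquoted[i] != f and quoted[i] != f)
        # no form of the configured name exists in the dataset
        or (input_cols[i] not in dataset_columns
            and unquoted[i] not in dataset_columns
            and quoted[i] not in dataset_columns)
    ):
        missing.append(f)
    elif input_cols[i] in dataset_columns:
        to_select.append(input_cols[i])
    elif unquoted[i] in dataset_columns:
        to_select.append(unquoted[i])
    else:
        to_select.append(quoted[i])

assert not missing
print(to_select)  # ['sepal_length', 'sepal_width']

Running it selects the unquoted names, since those are the forms actually present in the dataset; the real code then renames the selected frame's columns to feature_names_in_ before calling the estimator.
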