snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
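The near-identical +79 −43 counts across the estimator files under snowflake/ml/modeling/ come from a single change to the autogenerated estimator template (note the `custom_tags=dict([("autogen", True)])` telemetry tag in the hunks below). The two diffs that follow, for gaussian_nb.py and multinomial_nb.py (items 157 and 158 above), show that template change in full.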
--- a/snowflake/ml/modeling/naive_bayes/gaussian_nb.py
+++ b/snowflake/ml/modeling/naive_bayes/gaussian_nb.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -189,7 +191,6 @@ class GaussianNB(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -210,6 +211,15 @@ class GaussianNB(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -288,7 +298,7 @@ class GaussianNB(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -301,11 +311,12 @@ class GaussianNB(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -331,6 +342,7 @@ class GaussianNB(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -339,7 +351,8 @@ class GaussianNB(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -406,15 +419,15 @@ class GaussianNB(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )


@@ -424,7 +437,7 @@ class GaussianNB(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -470,7 +483,7 @@ class GaussianNB(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -562,7 +575,7 @@ class GaussianNB(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -618,26 +631,37 @@ class GaussianNB(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -718,11 +742,18 @@ class GaussianNB(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -793,10 +824,10 @@ class GaussianNB(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1025,7 +1056,7 @@ class GaussianNB(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1039,8 +1070,9 @@ class GaussianNB(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1066,6 +1098,7 @@ class GaussianNB(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1073,7 +1106,8 @@ class GaussianNB(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1123,14 +1157,14 @@ class GaussianNB(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1148,18 +1182,20 @@ class GaussianNB(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
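
Before the same template diff repeats for MultinomialNB below, one recurring edit above is worth unpacking: stage file paths are now built with posixpath.join instead of os.path.join. Snowflake stage locations are always "/"-separated, whereas os.path.join follows the client OS, so on a Windows client the old code would produce backslash paths. A minimal sketch of the difference (the stage and file names are invented for illustration):

import ntpath      # what os.path resolves to on Windows clients
import posixpath   # always uses POSIX "/" separators

stage_name = "SNOWML_TRANSFORM_ABC123"  # hypothetical temp stage name
file_name = "model.pkl"                 # hypothetical pickled estimator file

# On Windows, os.path.join produces a backslash path, which is not a
# valid Snowflake stage location:
assert ntpath.join(stage_name, file_name) == "SNOWML_TRANSFORM_ABC123\\model.pkl"

# posixpath.join yields the same "/"-separated path on every OS:
assert posixpath.join(stage_name, file_name) == "SNOWML_TRANSFORM_ABC123/model.pkl"

The same swap appears at the session.file.get call sites, since stage_result_file_name is later joined with the sproc's export file name.
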
--- a/snowflake/ml/modeling/naive_bayes/multinomial_nb.py
+++ b/snowflake/ml/modeling/naive_bayes/multinomial_nb.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

 from snowflake.ml.model.model_signature import (
     DataType,
@@ -200,7 +202,6 @@ class MultinomialNB(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

         self._deps = list(deps)
@@ -223,6 +224,15 @@ class MultinomialNB(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)

+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -301,7 +311,7 @@ class MultinomialNB(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)

         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -314,11 +324,12 @@ class MultinomialNB(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()

-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -344,6 +355,7 @@ class MultinomialNB(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -352,7 +364,8 @@ class MultinomialNB(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -419,15 +432,15 @@ class MultinomialNB(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )


@@ -437,7 +450,7 @@ class MultinomialNB(BaseTransformer):
             print("\n".join(fields[1:]))

         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -483,7 +496,7 @@ class MultinomialNB(BaseTransformer):

         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)

         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -575,7 +588,7 @@ class MultinomialNB(BaseTransformer):
             return transformed_pandas_df.to_dict("records")

         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )

         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -631,26 +644,37 @@ class MultinomialNB(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

         estimator = self._sklearn_object

-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator

         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -731,11 +755,18 @@ class MultinomialNB(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -806,10 +837,10 @@ class MultinomialNB(BaseTransformer):

     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]

         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1038,7 +1069,7 @@ class MultinomialNB(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)

         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1052,8 +1083,9 @@ class MultinomialNB(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()

-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1079,6 +1111,7 @@ class MultinomialNB(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1086,7 +1119,8 @@ class MultinomialNB(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1136,14 +1170,14 @@ class MultinomialNB(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )

         cleanup_temp_files([local_score_file_name])
@@ -1161,18 +1195,20 @@ class MultinomialNB(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)

     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
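
The other behavioral change repeated in both files is how the fit and score wrappers are invoked: they are now registered with anonymous=True and called through the handle returned by registration (fit_wrapper_sproc(session, ...)) instead of by name via session.call(fit_sproc_name, ...), with statement_params threaded through as an ordinary sproc argument. A minimal sketch of that pattern, assuming an open Snowpark Session; the sproc body is a stand-in, not the package's actual fit logic:

from typing import List

from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc


def run_fit(session: Session, query: str, input_cols: List[str]) -> str:
    # anonymous=True registers the procedure without creating a named object
    # in the current schema; the decorator returns a directly callable handle.
    @sproc(
        packages=["snowflake-snowpark-python"],
        replace=True,
        session=session,
        anonymous=True,
    )
    def fit_wrapper_sproc(session: Session, query: str, input_cols: List[str]) -> str:
        # Stand-in body: the real wrapper unpickles the estimator from a stage,
        # fits it on the query's result set, and uploads the fitted model.
        return f"fitted on {len(input_cols)} columns"

    # Before: session.call("SNOWML_FIT_<id>", query, ...) against a named sproc.
    # After: call the handle directly, passing the session as the first argument.
    return fit_wrapper_sproc(session, query, input_cols)

Relatedly, the _get_rand_id() refactor means stage, table, and sproc names now get a fresh random suffix on every call rather than one fixed id per estimator instance, which avoids reusing the same temporary names across repeated fit and score calls.
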