snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/lightgbm/lgbm_classifier.py

@@ -7,6 +7,7 @@
  #
  import inspect
  import os
+ import posixpath
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
  from uuid import uuid4

@@ -26,6 +27,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
  from snowflake.snowpark import DataFrame, Session
  from snowflake.snowpark.functions import pandas_udf, sproc
  from snowflake.snowpark.types import PandasSeries
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

  from snowflake.ml.model.model_signature import (
  DataType,
@@ -200,7 +202,6 @@ class LGBMClassifier(BaseTransformer):
  **kwargs,
  ) -> None:
  super().__init__()
- self.id = str(uuid4()).replace("-", "_").upper()
  deps: Set[str] = set([f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'])

  self._deps = list(deps)
@@ -240,6 +241,15 @@ class LGBMClassifier(BaseTransformer):
  self.set_drop_input_cols(drop_input_cols)
  self.set_sample_weight_col(sample_weight_col)

+ def _get_rand_id(self) -> str:
+ """
+ Generate random id to be used in sproc and stage names.
+
+ Returns:
+ Random id string usable in sproc, table, and stage names.
+ """
+ return str(uuid4()).replace("-", "_").upper()
+
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
  """
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
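Note: the removed per-instance self.id meant every fit/score call on the same estimator reused identical stage and sproc names; _get_rand_id() instead draws a fresh identifier per operation. A minimal standalone sketch of the idea (the names below are illustrative, not the package's API):

    from uuid import uuid4

    def _rand_id() -> str:
        # UUID4 with dashes replaced and upper-cased, so the result is safe
        # inside unquoted SQL identifiers such as stage and sproc names.
        return str(uuid4()).replace("-", "_").upper()

    # Drawing a fresh id per operation keeps two fit() calls on the same
    # estimator from colliding on (and overwriting) the same temp objects.
    transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=_rand_id())
    fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=_rand_id())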
@@ -318,7 +328,7 @@ class LGBMClassifier(BaseTransformer):
  cp.dump(self._sklearn_object, local_transform_file)

  # Create temp stage to run fit.
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
  SqlResultValidator(
  session=session,
@@ -331,11 +341,12 @@ class LGBMClassifier(BaseTransformer):
  expected_value=f"Stage area {transform_stage_name} successfully created."
  ).validate()

- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ # Use posixpath to construct stage paths
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
  local_result_file_name = get_temp_file_path()
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
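Switching from os.path.join to posixpath.join matters for clients on Windows: Snowflake stage paths always use forward slashes, while os.path.join uses the platform separator. A quick illustration:

    import os
    import posixpath

    stage_name = "SNOWML_TRANSFORM_ABC123"
    file_name = "model.pkl"

    # posixpath.join always emits '/', which stage paths require:
    print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl

    # os.path.join would emit 'SNOWML_TRANSFORM_ABC123\\model.pkl' on Windows,
    # which is not a valid stage path; on POSIX the two calls happen to agree.
    print(os.path.join(stage_name, file_name))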
@@ -361,6 +372,7 @@ class LGBMClassifier(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def fit_wrapper_sproc(
  session: Session,
@@ -369,7 +381,8 @@ class LGBMClassifier(BaseTransformer):
  stage_result_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> str:
  import cloudpickle as cp
  import numpy as np
@@ -436,15 +449,15 @@ class LGBMClassifier(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- sproc_export_file_name = session.call(
- fit_sproc_name,
+ sproc_export_file_name = fit_wrapper_sproc(
+ session,
  query,
  stage_transform_file_name,
  stage_result_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  if "|" in sproc_export_file_name:
@@ -454,7 +467,7 @@ class LGBMClassifier(BaseTransformer):
  print("\n".join(fields[1:]))

  session.file.get(
- os.path.join(stage_result_file_name, sproc_export_file_name),
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
  local_result_file_name,
  statement_params=statement_params
  )
@@ -500,7 +513,7 @@ class LGBMClassifier(BaseTransformer):

  # Register vectorized UDF for batch inference
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
- safe_id=self.id, method=inference_method)
+ safe_id=self._get_rand_id(), method=inference_method)

  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
  # will try to pickle all of self which fails.
@@ -592,7 +605,7 @@ class LGBMClassifier(BaseTransformer):
  return transformed_pandas_df.to_dict("records")

  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
- safe_id=self.id
+ safe_id=self._get_rand_id()
  )

  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -648,26 +661,37 @@ class LGBMClassifier(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
- missing_features = []
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
- missing_features.append(f)
-
- if len(missing_features) > 0:
- raise ValueError(
- "The feature names should match with those that were passed during fit.\n"
- f"Features seen during fit call but not present in the input: {missing_features}\n"
- f"Features in the input dataframe : {input_cols}\n"
- )
- input_df.columns = getattr(estimator, "feature_names_in_")
- else:
- # Just rename the column names to unquoted identifiers.
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+ if (
+ i >= len(input_cols)
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+ and quoted_input_cols[i] not in features_in_dataset)
+ ):
+ missing_features.append(f)
+ elif input_cols[i] in features_in_dataset:
+ columns_to_select.append(input_cols[i])
+ elif unquoted_input_cols[i] in features_in_dataset:
+ columns_to_select.append(unquoted_input_cols[i])
+ else:
+ columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+ raise ValueError(
+ "The feature names should match with those that were passed during fit.\n"
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
+ f"Features in the input dataframe : {input_cols}\n"
+ )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
  input_df
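The rewritten selection logic accepts each required feature under any of three spellings: the stored input column name, its unescaped form, and its re-escaped (quoted) form, picking whichever spelling is actually present in the pandas DataFrame. A standalone sketch of the matching idea, with simplified stand-ins for the identifier helpers:

    import pandas as pd

    def unescape(name: str) -> str:
        # simplified stand-in for identifier.get_unescaped_names
        return name[1:-1] if name.startswith('"') and name.endswith('"') else name

    def escape(name: str) -> str:
        # simplified stand-in for identifier.get_escaped_names
        return f'"{name}"'

    df = pd.DataFrame({'"FEATURE_A"': [1.0], "FEATURE_B": [2.0]})
    input_cols = ['"FEATURE_A"', "FEATURE_B"]

    columns_to_select = []
    for col in input_cols:
        # try the stored spelling, then the unquoted and quoted variants
        for candidate in (col, unescape(col), escape(unescape(col))):
            if candidate in df.columns:
                columns_to_select.append(candidate)
                break

    print(df[columns_to_select])  # both columns resolve despite mixed quoting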
@@ -748,11 +772,18 @@ class LGBMClassifier(BaseTransformer):
  Transformed dataset.
  """
  if isinstance(dataset, DataFrame):
+ expected_type_inferred = ""
+ # when it is classifier, infer the datatype from label columns
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
+ expected_type_inferred = convert_sp_to_sf_type(
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
+ )
+
  output_df = self._batch_inference(
  dataset=dataset,
  inference_method="predict",
  expected_output_cols_list=self.output_cols,
- expected_output_cols_type="",
+ expected_output_cols_type=expected_type_inferred,
  )
  elif isinstance(dataset, pd.DataFrame):
  output_df = self._sklearn_inference(
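Instead of leaving the expected output type empty, predict() on a Snowpark DataFrame now derives a concrete Snowflake type from the 'predict' model signature (itself inferred from the label column at fit time). A rough sketch of that conversion step; convert_sp_to_sf_type is a Snowpark-internal helper, so the exact output strings below are an assumption:

    from snowflake.snowpark.types import DoubleType, LongType
    # internal helper, imported in the diff above; subject to change
    from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

    # A classifier fitted on an integer label column carries an integer-typed
    # 'predict' output in its signature, which maps to a SQL type name:
    print(convert_sp_to_sf_type(LongType()))    # expected: 'BIGINT'
    print(convert_sp_to_sf_type(DoubleType()))  # expected: 'DOUBLE'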
@@ -823,10 +854,10 @@ class LGBMClassifier(BaseTransformer):

  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
- Returns an empty list if current object is not a classifier or not yet fitted.
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
  """
  if getattr(self._sklearn_object, "classes_", None) is None:
- return []
+ return [output_cols_prefix]

  classes = self._sklearn_object.classes_
  if isinstance(classes, numpy.ndarray):
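For a fitted classifier the method goes on to emit one column per entry in classes_ under the given prefix; the change above only replaces the non-classifier fallback, so callers always get at least one column name back. A simplified sketch of the naming scheme under that assumption:

    from typing import List, Optional
    import numpy as np

    def output_column_names(prefix: str, classes: Optional[np.ndarray]) -> List[str]:
        if classes is None:
            # new fallback: a single column named by the prefix itself
            return [prefix]
        # classifier case (assumed naming): one column per class label
        return [f"{prefix}{c}" for c in classes]

    print(output_column_names("predict_proba_", np.array([0, 1])))  # ['predict_proba_0', 'predict_proba_1']
    print(output_column_names("decision_function_", None))          # ['decision_function_']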
@@ -1055,7 +1086,7 @@ class LGBMClassifier(BaseTransformer):
  cp.dump(self._sklearn_object, local_score_file)

  # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  session = dataset._session
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
  SqlResultValidator(
@@ -1069,8 +1100,9 @@ class LGBMClassifier(BaseTransformer):
  expected_value=f"Stage area {score_stage_name} successfully created."
  ).validate()

- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ # Use posixpath to construct stage paths
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -1096,6 +1128,7 @@ class LGBMClassifier(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def score_wrapper_sproc(
  session: Session,
@@ -1103,7 +1136,8 @@ class LGBMClassifier(BaseTransformer):
  stage_score_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> float:
  import cloudpickle as cp
  import numpy as np
@@ -1153,14 +1187,14 @@ class LGBMClassifier(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- score = session.call(
- score_sproc_name,
+ score = score_wrapper_sproc(
+ session,
  query,
  stage_score_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  cleanup_temp_files([local_score_file_name])
@@ -1178,18 +1212,20 @@ class LGBMClassifier(BaseTransformer):
  if self._sklearn_object._estimator_type == 'classifier':
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  # For regressor, the type of predict is float64
  elif self._sklearn_object._estimator_type == 'regressor':
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  for prob_func in PROB_FUNCTIONS:
  if hasattr(self, prob_func):
  output_cols_prefix: str = f"{prob_func}_"
  output_column_names = self._get_output_column_names(output_cols_prefix)
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)

  @property
  def model_signatures(self) -> Dict[str, ModelSignature]:
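With this change every recorded signature declares the pass-through input features as part of its outputs unless drop_input_cols is set, matching what batch inference actually returns. A minimal sketch of the construction, using the public model_signature types imported at the top of the file:

    from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

    drop_input_cols = False
    inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="FEATURE_A")]
    outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_PRED")]

    # Inputs are prepended to the declared outputs when they are passed through:
    sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
    print([f.name for f in sig.outputs])  # ['FEATURE_A', 'OUTPUT_PRED']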
snowflake/ml/modeling/lightgbm/lgbm_regressor.py

The regressor receives the same set of changes as the classifier above; no further illustrative notes are repeated here.

@@ -7,6 +7,7 @@
  #
  import inspect
  import os
+ import posixpath
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
  from uuid import uuid4

@@ -26,6 +27,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
  from snowflake.snowpark import DataFrame, Session
  from snowflake.snowpark.functions import pandas_udf, sproc
  from snowflake.snowpark.types import PandasSeries
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

  from snowflake.ml.model.model_signature import (
  DataType,
@@ -200,7 +202,6 @@ class LGBMRegressor(BaseTransformer):
  **kwargs,
  ) -> None:
  super().__init__()
- self.id = str(uuid4()).replace("-", "_").upper()
  deps: Set[str] = set([f'numpy=={np.__version__}', f'lightgbm=={lightgbm.__version__}', f'cloudpickle=={cp.__version__}'])

  self._deps = list(deps)
@@ -240,6 +241,15 @@ class LGBMRegressor(BaseTransformer):
  self.set_drop_input_cols(drop_input_cols)
  self.set_sample_weight_col(sample_weight_col)

+ def _get_rand_id(self) -> str:
+ """
+ Generate random id to be used in sproc and stage names.
+
+ Returns:
+ Random id string usable in sproc, table, and stage names.
+ """
+ return str(uuid4()).replace("-", "_").upper()
+
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
  """
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -318,7 +328,7 @@ class LGBMRegressor(BaseTransformer):
  cp.dump(self._sklearn_object, local_transform_file)

  # Create temp stage to run fit.
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
  SqlResultValidator(
  session=session,
@@ -331,11 +341,12 @@ class LGBMRegressor(BaseTransformer):
  expected_value=f"Stage area {transform_stage_name} successfully created."
  ).validate()

- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ # Use posixpath to construct stage paths
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
  local_result_file_name = get_temp_file_path()
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -361,6 +372,7 @@ class LGBMRegressor(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def fit_wrapper_sproc(
  session: Session,
@@ -369,7 +381,8 @@ class LGBMRegressor(BaseTransformer):
  stage_result_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> str:
  import cloudpickle as cp
  import numpy as np
@@ -436,15 +449,15 @@ class LGBMRegressor(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- sproc_export_file_name = session.call(
- fit_sproc_name,
+ sproc_export_file_name = fit_wrapper_sproc(
+ session,
  query,
  stage_transform_file_name,
  stage_result_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  if "|" in sproc_export_file_name:
@@ -454,7 +467,7 @@ class LGBMRegressor(BaseTransformer):
  print("\n".join(fields[1:]))

  session.file.get(
- os.path.join(stage_result_file_name, sproc_export_file_name),
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
  local_result_file_name,
  statement_params=statement_params
  )
@@ -500,7 +513,7 @@ class LGBMRegressor(BaseTransformer):

  # Register vectorized UDF for batch inference
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
- safe_id=self.id, method=inference_method)
+ safe_id=self._get_rand_id(), method=inference_method)

  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
  # will try to pickle all of self which fails.
@@ -592,7 +605,7 @@ class LGBMRegressor(BaseTransformer):
  return transformed_pandas_df.to_dict("records")

  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
- safe_id=self.id
+ safe_id=self._get_rand_id()
  )

  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -648,26 +661,37 @@ class LGBMRegressor(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
- missing_features = []
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
- missing_features.append(f)
-
- if len(missing_features) > 0:
- raise ValueError(
- "The feature names should match with those that were passed during fit.\n"
- f"Features seen during fit call but not present in the input: {missing_features}\n"
- f"Features in the input dataframe : {input_cols}\n"
- )
- input_df.columns = getattr(estimator, "feature_names_in_")
- else:
- # Just rename the column names to unquoted identifiers.
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+ if (
+ i >= len(input_cols)
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+ and quoted_input_cols[i] not in features_in_dataset)
+ ):
+ missing_features.append(f)
+ elif input_cols[i] in features_in_dataset:
+ columns_to_select.append(input_cols[i])
+ elif unquoted_input_cols[i] in features_in_dataset:
+ columns_to_select.append(unquoted_input_cols[i])
+ else:
+ columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+ raise ValueError(
+ "The feature names should match with those that were passed during fit.\n"
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
+ f"Features in the input dataframe : {input_cols}\n"
+ )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
  input_df
@@ -748,11 +772,18 @@ class LGBMRegressor(BaseTransformer):
  Transformed dataset.
  """
  if isinstance(dataset, DataFrame):
+ expected_type_inferred = "float"
+ # when it is classifier, infer the datatype from label columns
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
+ expected_type_inferred = convert_sp_to_sf_type(
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
+ )
+
  output_df = self._batch_inference(
  dataset=dataset,
  inference_method="predict",
  expected_output_cols_list=self.output_cols,
- expected_output_cols_type="float",
+ expected_output_cols_type=expected_type_inferred,
  )
  elif isinstance(dataset, pd.DataFrame):
  output_df = self._sklearn_inference(
@@ -823,10 +854,10 @@ class LGBMRegressor(BaseTransformer):

  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
- Returns an empty list if current object is not a classifier or not yet fitted.
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
  """
  if getattr(self._sklearn_object, "classes_", None) is None:
- return []
+ return [output_cols_prefix]

  classes = self._sklearn_object.classes_
  if isinstance(classes, numpy.ndarray):
@@ -1051,7 +1082,7 @@ class LGBMRegressor(BaseTransformer):
  cp.dump(self._sklearn_object, local_score_file)

  # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  session = dataset._session
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
  SqlResultValidator(
@@ -1065,8 +1096,9 @@ class LGBMRegressor(BaseTransformer):
  expected_value=f"Stage area {score_stage_name} successfully created."
  ).validate()

- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ # Use posixpath to construct stage paths
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -1092,6 +1124,7 @@ class LGBMRegressor(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def score_wrapper_sproc(
  session: Session,
@@ -1099,7 +1132,8 @@ class LGBMRegressor(BaseTransformer):
  stage_score_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> float:
  import cloudpickle as cp
  import numpy as np
@@ -1149,14 +1183,14 @@ class LGBMRegressor(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- score = session.call(
- score_sproc_name,
+ score = score_wrapper_sproc(
+ session,
  query,
  stage_score_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  cleanup_temp_files([local_score_file_name])
@@ -1174,18 +1208,20 @@ class LGBMRegressor(BaseTransformer):
  if self._sklearn_object._estimator_type == 'classifier':
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  # For regressor, the type of predict is float64
  elif self._sklearn_object._estimator_type == 'regressor':
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  for prob_func in PROB_FUNCTIONS:
  if hasattr(self, prob_func):
  output_cols_prefix: str = f"{prob_func}_"
  output_column_names = self._get_output_column_names(output_cols_prefix)
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)

  @property
  def model_signatures(self) -> Dict[str, ModelSignature]: