snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -240,7 +242,6 @@ class Lasso(BaseTransformer):
240
242
  sample_weight_col: Optional[str] = None,
241
243
  ) -> None:
242
244
  super().__init__()
243
- self.id = str(uuid4()).replace("-", "_").upper()
244
245
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
245
246
 
246
247
  self._deps = list(deps)
@@ -269,6 +270,15 @@ class Lasso(BaseTransformer):
269
270
  self.set_drop_input_cols(drop_input_cols)
270
271
  self.set_sample_weight_col(sample_weight_col)
271
272
 
273
+ def _get_rand_id(self) -> str:
274
+ """
275
+ Generate random id to be used in sproc and stage names.
276
+
277
+ Returns:
278
+ Random id string usable in sproc, table, and stage names.
279
+ """
280
+ return str(uuid4()).replace("-", "_").upper()
281
+
272
282
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
273
283
  """
274
284
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -347,7 +357,7 @@ class Lasso(BaseTransformer):
347
357
  cp.dump(self._sklearn_object, local_transform_file)
348
358
 
349
359
  # Create temp stage to run fit.
350
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
360
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
351
361
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
352
362
  SqlResultValidator(
353
363
  session=session,
@@ -360,11 +370,12 @@ class Lasso(BaseTransformer):
360
370
  expected_value=f"Stage area {transform_stage_name} successfully created."
361
371
  ).validate()
362
372
 
363
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
373
+ # Use posixpath to construct stage paths
374
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
375
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
364
376
  local_result_file_name = get_temp_file_path()
365
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
366
377
 
367
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
378
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
368
379
  statement_params = telemetry.get_function_usage_statement_params(
369
380
  project=_PROJECT,
370
381
  subproject=_SUBPROJECT,
@@ -390,6 +401,7 @@ class Lasso(BaseTransformer):
390
401
  replace=True,
391
402
  session=session,
392
403
  statement_params=statement_params,
404
+ anonymous=True
393
405
  )
394
406
  def fit_wrapper_sproc(
395
407
  session: Session,
@@ -398,7 +410,8 @@ class Lasso(BaseTransformer):
398
410
  stage_result_file_name: str,
399
411
  input_cols: List[str],
400
412
  label_cols: List[str],
401
- sample_weight_col: Optional[str]
413
+ sample_weight_col: Optional[str],
414
+ statement_params: Dict[str, str]
402
415
  ) -> str:
403
416
  import cloudpickle as cp
404
417
  import numpy as np
@@ -465,15 +478,15 @@ class Lasso(BaseTransformer):
465
478
  api_calls=[Session.call],
466
479
  custom_tags=dict([("autogen", True)]),
467
480
  )
468
- sproc_export_file_name = session.call(
469
- fit_sproc_name,
481
+ sproc_export_file_name = fit_wrapper_sproc(
482
+ session,
470
483
  query,
471
484
  stage_transform_file_name,
472
485
  stage_result_file_name,
473
486
  identifier.get_unescaped_names(self.input_cols),
474
487
  identifier.get_unescaped_names(self.label_cols),
475
488
  identifier.get_unescaped_names(self.sample_weight_col),
476
- statement_params=statement_params,
489
+ statement_params,
477
490
  )
478
491
 
479
492
  if "|" in sproc_export_file_name:
@@ -483,7 +496,7 @@ class Lasso(BaseTransformer):
483
496
  print("\n".join(fields[1:]))
484
497
 
485
498
  session.file.get(
486
- os.path.join(stage_result_file_name, sproc_export_file_name),
499
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
487
500
  local_result_file_name,
488
501
  statement_params=statement_params
489
502
  )
@@ -529,7 +542,7 @@ class Lasso(BaseTransformer):
529
542
 
530
543
  # Register vectorized UDF for batch inference
531
544
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
532
- safe_id=self.id, method=inference_method)
545
+ safe_id=self._get_rand_id(), method=inference_method)
533
546
 
534
547
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
535
548
  # will try to pickle all of self which fails.
@@ -621,7 +634,7 @@ class Lasso(BaseTransformer):
621
634
  return transformed_pandas_df.to_dict("records")
622
635
 
623
636
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
624
- safe_id=self.id
637
+ safe_id=self._get_rand_id()
625
638
  )
626
639
 
627
640
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -677,26 +690,37 @@ class Lasso(BaseTransformer):
677
690
  # input cols need to match unquoted / quoted
678
691
  input_cols = self.input_cols
679
692
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
693
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
680
694
 
681
695
  estimator = self._sklearn_object
682
696
 
683
- input_df = dataset[input_cols] # Select input columns with quoted column names.
684
- if hasattr(estimator, "feature_names_in_"):
685
- missing_features = []
686
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
687
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
688
- missing_features.append(f)
689
-
690
- if len(missing_features) > 0:
691
- raise ValueError(
692
- "The feature names should match with those that were passed during fit.\n"
693
- f"Features seen during fit call but not present in the input: {missing_features}\n"
694
- f"Features in the input dataframe : {input_cols}\n"
695
- )
696
- input_df.columns = getattr(estimator, "feature_names_in_")
697
- else:
698
- # Just rename the column names to unquoted identifiers.
699
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
697
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
698
+ missing_features = []
699
+ features_in_dataset = set(dataset.columns)
700
+ columns_to_select = []
701
+ for i, f in enumerate(features_required_by_estimator):
702
+ if (
703
+ i >= len(input_cols)
704
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
705
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
706
+ and quoted_input_cols[i] not in features_in_dataset)
707
+ ):
708
+ missing_features.append(f)
709
+ elif input_cols[i] in features_in_dataset:
710
+ columns_to_select.append(input_cols[i])
711
+ elif unquoted_input_cols[i] in features_in_dataset:
712
+ columns_to_select.append(unquoted_input_cols[i])
713
+ else:
714
+ columns_to_select.append(quoted_input_cols[i])
715
+
716
+ if len(missing_features) > 0:
717
+ raise ValueError(
718
+ "The feature names should match with those that were passed during fit.\n"
719
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
720
+ f"Features in the input dataframe : {input_cols}\n"
721
+ )
722
+ input_df = dataset[columns_to_select]
723
+ input_df.columns = features_required_by_estimator
700
724
 
701
725
  transformed_numpy_array = getattr(estimator, inference_method)(
702
726
  input_df
@@ -777,11 +801,18 @@ class Lasso(BaseTransformer):
777
801
  Transformed dataset.
778
802
  """
779
803
  if isinstance(dataset, DataFrame):
804
+ expected_type_inferred = "float"
805
+ # when it is classifier, infer the datatype from label columns
806
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
807
+ expected_type_inferred = convert_sp_to_sf_type(
808
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
809
+ )
810
+
780
811
  output_df = self._batch_inference(
781
812
  dataset=dataset,
782
813
  inference_method="predict",
783
814
  expected_output_cols_list=self.output_cols,
784
- expected_output_cols_type="float",
815
+ expected_output_cols_type=expected_type_inferred,
785
816
  )
786
817
  elif isinstance(dataset, pd.DataFrame):
787
818
  output_df = self._sklearn_inference(
@@ -852,10 +883,10 @@ class Lasso(BaseTransformer):
852
883
 
853
884
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
854
885
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
855
- Returns an empty list if current object is not a classifier or not yet fitted.
886
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
856
887
  """
857
888
  if getattr(self._sklearn_object, "classes_", None) is None:
858
- return []
889
+ return [output_cols_prefix]
859
890
 
860
891
  classes = self._sklearn_object.classes_
861
892
  if isinstance(classes, numpy.ndarray):
@@ -1080,7 +1111,7 @@ class Lasso(BaseTransformer):
1080
1111
  cp.dump(self._sklearn_object, local_score_file)
1081
1112
 
1082
1113
  # Create temp stage to run score.
1083
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1114
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1084
1115
  session = dataset._session
1085
1116
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1086
1117
  SqlResultValidator(
@@ -1094,8 +1125,9 @@ class Lasso(BaseTransformer):
1094
1125
  expected_value=f"Stage area {score_stage_name} successfully created."
1095
1126
  ).validate()
1096
1127
 
1097
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1098
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1128
+ # Use posixpath to construct stage paths
1129
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1130
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1099
1131
  statement_params = telemetry.get_function_usage_statement_params(
1100
1132
  project=_PROJECT,
1101
1133
  subproject=_SUBPROJECT,
@@ -1121,6 +1153,7 @@ class Lasso(BaseTransformer):
1121
1153
  replace=True,
1122
1154
  session=session,
1123
1155
  statement_params=statement_params,
1156
+ anonymous=True
1124
1157
  )
1125
1158
  def score_wrapper_sproc(
1126
1159
  session: Session,
@@ -1128,7 +1161,8 @@ class Lasso(BaseTransformer):
1128
1161
  stage_score_file_name: str,
1129
1162
  input_cols: List[str],
1130
1163
  label_cols: List[str],
1131
- sample_weight_col: Optional[str]
1164
+ sample_weight_col: Optional[str],
1165
+ statement_params: Dict[str, str]
1132
1166
  ) -> float:
1133
1167
  import cloudpickle as cp
1134
1168
  import numpy as np
@@ -1178,14 +1212,14 @@ class Lasso(BaseTransformer):
1178
1212
  api_calls=[Session.call],
1179
1213
  custom_tags=dict([("autogen", True)]),
1180
1214
  )
1181
- score = session.call(
1182
- score_sproc_name,
1215
+ score = score_wrapper_sproc(
1216
+ session,
1183
1217
  query,
1184
1218
  stage_score_file_name,
1185
1219
  identifier.get_unescaped_names(self.input_cols),
1186
1220
  identifier.get_unescaped_names(self.label_cols),
1187
1221
  identifier.get_unescaped_names(self.sample_weight_col),
1188
- statement_params=statement_params,
1222
+ statement_params,
1189
1223
  )
1190
1224
 
1191
1225
  cleanup_temp_files([local_score_file_name])
@@ -1203,18 +1237,20 @@ class Lasso(BaseTransformer):
1203
1237
  if self._sklearn_object._estimator_type == 'classifier':
1204
1238
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1205
1239
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1206
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1240
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1241
+ ([] if self._drop_input_cols else inputs) + outputs)
1207
1242
  # For regressor, the type of predict is float64
1208
1243
  elif self._sklearn_object._estimator_type == 'regressor':
1209
1244
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1210
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1211
-
1245
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1246
+ ([] if self._drop_input_cols else inputs) + outputs)
1212
1247
  for prob_func in PROB_FUNCTIONS:
1213
1248
  if hasattr(self, prob_func):
1214
1249
  output_cols_prefix: str = f"{prob_func}_"
1215
1250
  output_column_names = self._get_output_column_names(output_cols_prefix)
1216
1251
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1217
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1252
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1253
+ ([] if self._drop_input_cols else inputs) + outputs)
1218
1254
 
1219
1255
  @property
1220
1256
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -264,7 +266,6 @@ class LassoCV(BaseTransformer):
264
266
  sample_weight_col: Optional[str] = None,
265
267
  ) -> None:
266
268
  super().__init__()
267
- self.id = str(uuid4()).replace("-", "_").upper()
268
269
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
269
270
 
270
271
  self._deps = list(deps)
@@ -297,6 +298,15 @@ class LassoCV(BaseTransformer):
297
298
  self.set_drop_input_cols(drop_input_cols)
298
299
  self.set_sample_weight_col(sample_weight_col)
299
300
 
301
+ def _get_rand_id(self) -> str:
302
+ """
303
+ Generate random id to be used in sproc and stage names.
304
+
305
+ Returns:
306
+ Random id string usable in sproc, table, and stage names.
307
+ """
308
+ return str(uuid4()).replace("-", "_").upper()
309
+
300
310
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
301
311
  """
302
312
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -375,7 +385,7 @@ class LassoCV(BaseTransformer):
375
385
  cp.dump(self._sklearn_object, local_transform_file)
376
386
 
377
387
  # Create temp stage to run fit.
378
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
388
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
379
389
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
380
390
  SqlResultValidator(
381
391
  session=session,
@@ -388,11 +398,12 @@ class LassoCV(BaseTransformer):
388
398
  expected_value=f"Stage area {transform_stage_name} successfully created."
389
399
  ).validate()
390
400
 
391
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
401
+ # Use posixpath to construct stage paths
402
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
403
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
392
404
  local_result_file_name = get_temp_file_path()
393
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
394
405
 
395
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
406
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
396
407
  statement_params = telemetry.get_function_usage_statement_params(
397
408
  project=_PROJECT,
398
409
  subproject=_SUBPROJECT,
@@ -418,6 +429,7 @@ class LassoCV(BaseTransformer):
418
429
  replace=True,
419
430
  session=session,
420
431
  statement_params=statement_params,
432
+ anonymous=True
421
433
  )
422
434
  def fit_wrapper_sproc(
423
435
  session: Session,
@@ -426,7 +438,8 @@ class LassoCV(BaseTransformer):
426
438
  stage_result_file_name: str,
427
439
  input_cols: List[str],
428
440
  label_cols: List[str],
429
- sample_weight_col: Optional[str]
441
+ sample_weight_col: Optional[str],
442
+ statement_params: Dict[str, str]
430
443
  ) -> str:
431
444
  import cloudpickle as cp
432
445
  import numpy as np
@@ -493,15 +506,15 @@ class LassoCV(BaseTransformer):
493
506
  api_calls=[Session.call],
494
507
  custom_tags=dict([("autogen", True)]),
495
508
  )
496
- sproc_export_file_name = session.call(
497
- fit_sproc_name,
509
+ sproc_export_file_name = fit_wrapper_sproc(
510
+ session,
498
511
  query,
499
512
  stage_transform_file_name,
500
513
  stage_result_file_name,
501
514
  identifier.get_unescaped_names(self.input_cols),
502
515
  identifier.get_unescaped_names(self.label_cols),
503
516
  identifier.get_unescaped_names(self.sample_weight_col),
504
- statement_params=statement_params,
517
+ statement_params,
505
518
  )
506
519
 
507
520
  if "|" in sproc_export_file_name:
@@ -511,7 +524,7 @@ class LassoCV(BaseTransformer):
511
524
  print("\n".join(fields[1:]))
512
525
 
513
526
  session.file.get(
514
- os.path.join(stage_result_file_name, sproc_export_file_name),
527
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
515
528
  local_result_file_name,
516
529
  statement_params=statement_params
517
530
  )
@@ -557,7 +570,7 @@ class LassoCV(BaseTransformer):
557
570
 
558
571
  # Register vectorized UDF for batch inference
559
572
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
560
- safe_id=self.id, method=inference_method)
573
+ safe_id=self._get_rand_id(), method=inference_method)
561
574
 
562
575
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
563
576
  # will try to pickle all of self which fails.
@@ -649,7 +662,7 @@ class LassoCV(BaseTransformer):
649
662
  return transformed_pandas_df.to_dict("records")
650
663
 
651
664
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
652
- safe_id=self.id
665
+ safe_id=self._get_rand_id()
653
666
  )
654
667
 
655
668
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -705,26 +718,37 @@ class LassoCV(BaseTransformer):
705
718
  # input cols need to match unquoted / quoted
706
719
  input_cols = self.input_cols
707
720
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
721
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
708
722
 
709
723
  estimator = self._sklearn_object
710
724
 
711
- input_df = dataset[input_cols] # Select input columns with quoted column names.
712
- if hasattr(estimator, "feature_names_in_"):
713
- missing_features = []
714
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
715
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
716
- missing_features.append(f)
717
-
718
- if len(missing_features) > 0:
719
- raise ValueError(
720
- "The feature names should match with those that were passed during fit.\n"
721
- f"Features seen during fit call but not present in the input: {missing_features}\n"
722
- f"Features in the input dataframe : {input_cols}\n"
723
- )
724
- input_df.columns = getattr(estimator, "feature_names_in_")
725
- else:
726
- # Just rename the column names to unquoted identifiers.
727
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
725
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
726
+ missing_features = []
727
+ features_in_dataset = set(dataset.columns)
728
+ columns_to_select = []
729
+ for i, f in enumerate(features_required_by_estimator):
730
+ if (
731
+ i >= len(input_cols)
732
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
733
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
734
+ and quoted_input_cols[i] not in features_in_dataset)
735
+ ):
736
+ missing_features.append(f)
737
+ elif input_cols[i] in features_in_dataset:
738
+ columns_to_select.append(input_cols[i])
739
+ elif unquoted_input_cols[i] in features_in_dataset:
740
+ columns_to_select.append(unquoted_input_cols[i])
741
+ else:
742
+ columns_to_select.append(quoted_input_cols[i])
743
+
744
+ if len(missing_features) > 0:
745
+ raise ValueError(
746
+ "The feature names should match with those that were passed during fit.\n"
747
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
748
+ f"Features in the input dataframe : {input_cols}\n"
749
+ )
750
+ input_df = dataset[columns_to_select]
751
+ input_df.columns = features_required_by_estimator
728
752
 
729
753
  transformed_numpy_array = getattr(estimator, inference_method)(
730
754
  input_df
@@ -805,11 +829,18 @@ class LassoCV(BaseTransformer):
805
829
  Transformed dataset.
806
830
  """
807
831
  if isinstance(dataset, DataFrame):
832
+ expected_type_inferred = "float"
833
+ # when it is classifier, infer the datatype from label columns
834
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
835
+ expected_type_inferred = convert_sp_to_sf_type(
836
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
837
+ )
838
+
808
839
  output_df = self._batch_inference(
809
840
  dataset=dataset,
810
841
  inference_method="predict",
811
842
  expected_output_cols_list=self.output_cols,
812
- expected_output_cols_type="float",
843
+ expected_output_cols_type=expected_type_inferred,
813
844
  )
814
845
  elif isinstance(dataset, pd.DataFrame):
815
846
  output_df = self._sklearn_inference(
@@ -880,10 +911,10 @@ class LassoCV(BaseTransformer):
880
911
 
881
912
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
882
913
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
883
- Returns an empty list if current object is not a classifier or not yet fitted.
914
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
884
915
  """
885
916
  if getattr(self._sklearn_object, "classes_", None) is None:
886
- return []
917
+ return [output_cols_prefix]
887
918
 
888
919
  classes = self._sklearn_object.classes_
889
920
  if isinstance(classes, numpy.ndarray):
@@ -1108,7 +1139,7 @@ class LassoCV(BaseTransformer):
1108
1139
  cp.dump(self._sklearn_object, local_score_file)
1109
1140
 
1110
1141
  # Create temp stage to run score.
1111
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1142
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1112
1143
  session = dataset._session
1113
1144
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1114
1145
  SqlResultValidator(
@@ -1122,8 +1153,9 @@ class LassoCV(BaseTransformer):
1122
1153
  expected_value=f"Stage area {score_stage_name} successfully created."
1123
1154
  ).validate()
1124
1155
 
1125
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1126
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1156
+ # Use posixpath to construct stage paths
1157
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1158
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1127
1159
  statement_params = telemetry.get_function_usage_statement_params(
1128
1160
  project=_PROJECT,
1129
1161
  subproject=_SUBPROJECT,
@@ -1149,6 +1181,7 @@ class LassoCV(BaseTransformer):
1149
1181
  replace=True,
1150
1182
  session=session,
1151
1183
  statement_params=statement_params,
1184
+ anonymous=True
1152
1185
  )
1153
1186
  def score_wrapper_sproc(
1154
1187
  session: Session,
@@ -1156,7 +1189,8 @@ class LassoCV(BaseTransformer):
1156
1189
  stage_score_file_name: str,
1157
1190
  input_cols: List[str],
1158
1191
  label_cols: List[str],
1159
- sample_weight_col: Optional[str]
1192
+ sample_weight_col: Optional[str],
1193
+ statement_params: Dict[str, str]
1160
1194
  ) -> float:
1161
1195
  import cloudpickle as cp
1162
1196
  import numpy as np
@@ -1206,14 +1240,14 @@ class LassoCV(BaseTransformer):
1206
1240
  api_calls=[Session.call],
1207
1241
  custom_tags=dict([("autogen", True)]),
1208
1242
  )
1209
- score = session.call(
1210
- score_sproc_name,
1243
+ score = score_wrapper_sproc(
1244
+ session,
1211
1245
  query,
1212
1246
  stage_score_file_name,
1213
1247
  identifier.get_unescaped_names(self.input_cols),
1214
1248
  identifier.get_unescaped_names(self.label_cols),
1215
1249
  identifier.get_unescaped_names(self.sample_weight_col),
1216
- statement_params=statement_params,
1250
+ statement_params,
1217
1251
  )
1218
1252
 
1219
1253
  cleanup_temp_files([local_score_file_name])
@@ -1231,18 +1265,20 @@ class LassoCV(BaseTransformer):
1231
1265
  if self._sklearn_object._estimator_type == 'classifier':
1232
1266
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1233
1267
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1234
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1268
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1269
+ ([] if self._drop_input_cols else inputs) + outputs)
1235
1270
  # For regressor, the type of predict is float64
1236
1271
  elif self._sklearn_object._estimator_type == 'regressor':
1237
1272
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1238
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1239
-
1273
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1274
+ ([] if self._drop_input_cols else inputs) + outputs)
1240
1275
  for prob_func in PROB_FUNCTIONS:
1241
1276
  if hasattr(self, prob_func):
1242
1277
  output_cols_prefix: str = f"{prob_func}_"
1243
1278
  output_column_names = self._get_output_column_names(output_cols_prefix)
1244
1279
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1245
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1280
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1281
+ ([] if self._drop_input_cols else inputs) + outputs)
1246
1282
 
1247
1283
  @property
1248
1284
  def model_signatures(self) -> Dict[str, ModelSignature]: