snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/linear_model/lars.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -239,7 +241,6 @@ class Lars(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -268,6 +269,15 @@ class Lars(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
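The removed constructor line and this new helper change how unique suffixes are minted: rather than one id fixed at construction time and reused for the object's lifetime, every temporary sproc, stage, table, and UDF name below now gets a fresh UUID-derived suffix per call. A standalone sketch of the helper (module-level here purely for illustration):

```python
from uuid import uuid4

def get_rand_id() -> str:
    # Hyphens become underscores and the result is upper-cased, so the id
    # can be embedded in unquoted Snowflake identifiers.
    return str(uuid4()).replace("-", "_").upper()

print(get_rand_id())  # e.g. 'D4C0A1B2_3E4F_5A6B_7C8D_9E0F1A2B3C4D'
```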
@@ -346,7 +356,7 @@ class Lars(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -359,11 +369,12 @@ class Lars(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
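The os.path.join → posixpath.join switch matters for clients on Windows: os.path uses the platform separator, so stage paths would be assembled with backslashes, which are not valid separators in Snowflake stage locations. posixpath always emits forward slashes. A minimal illustration, using ntpath to stand in for os.path on Windows:

```python
import ntpath      # what os.path resolves to on Windows
import posixpath

stage_name = "SNOWML_TRANSFORM_ABC123"  # assumed example stage
file_name = "model.pkl"                 # assumed example file

print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl
```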
@@ -389,6 +400,7 @@ class Lars(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -397,7 +409,8 @@ class Lars(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -464,15 +477,15 @@ class Lars(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
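Taken together, anonymous=True and the switch from session.call(fit_sproc_name, ...) to calling the decorated fit_wrapper_sproc directly mean the fit procedure is registered as an anonymous stored procedure and invoked through the handle Snowpark returns, instead of being looked up by name; statement_params also becomes an explicit argument of the wrapper. A minimal sketch of the pattern, assuming an existing Session named session and a hypothetical add_one procedure:

```python
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

# session = Session.builder.configs(connection_parameters).create()  # assumed

@sproc(replace=True, session=session, anonymous=True,
       packages=["snowflake-snowpark-python"])
def add_one(session: Session, x: int) -> int:
    return x + 1

# The decorator returns a callable handle; there is no named server-side
# object to address via session.call().
result = add_one(session, 41)  # -> 42
```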
@@ -482,7 +495,7 @@ class Lars(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -528,7 +541,7 @@ class Lars(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -620,7 +633,7 @@ class Lars(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
676
689
  # input cols need to match unquoted / quoted
677
690
  input_cols = self.input_cols
678
691
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
692
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
679
693
 
680
694
  estimator = self._sklearn_object
681
695
 
682
- input_df = dataset[input_cols] # Select input columns with quoted column names.
683
- if hasattr(estimator, "feature_names_in_"):
684
- missing_features = []
685
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
686
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
687
- missing_features.append(f)
688
-
689
- if len(missing_features) > 0:
690
- raise ValueError(
691
- "The feature names should match with those that were passed during fit.\n"
692
- f"Features seen during fit call but not present in the input: {missing_features}\n"
693
- f"Features in the input dataframe : {input_cols}\n"
694
- )
695
- input_df.columns = getattr(estimator, "feature_names_in_")
696
- else:
697
- # Just rename the column names to unquoted identifiers.
698
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
696
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
697
+ missing_features = []
698
+ features_in_dataset = set(dataset.columns)
699
+ columns_to_select = []
700
+ for i, f in enumerate(features_required_by_estimator):
701
+ if (
702
+ i >= len(input_cols)
703
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
704
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
705
+ and quoted_input_cols[i] not in features_in_dataset)
706
+ ):
707
+ missing_features.append(f)
708
+ elif input_cols[i] in features_in_dataset:
709
+ columns_to_select.append(input_cols[i])
710
+ elif unquoted_input_cols[i] in features_in_dataset:
711
+ columns_to_select.append(unquoted_input_cols[i])
712
+ else:
713
+ columns_to_select.append(quoted_input_cols[i])
714
+
715
+ if len(missing_features) > 0:
716
+ raise ValueError(
717
+ "The feature names should match with those that were passed during fit.\n"
718
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
719
+ f"Features in the input dataframe : {input_cols}\n"
720
+ )
721
+ input_df = dataset[columns_to_select]
722
+ input_df.columns = features_required_by_estimator
699
723
 
700
724
  transformed_numpy_array = getattr(estimator, inference_method)(
701
725
  input_df
@@ -776,11 +800,18 @@ class Lars(BaseTransformer):
776
800
  Transformed dataset.
777
801
  """
778
802
  if isinstance(dataset, DataFrame):
803
+ expected_type_inferred = "float"
804
+ # when it is classifier, infer the datatype from label columns
805
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
806
+ expected_type_inferred = convert_sp_to_sf_type(
807
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
808
+ )
809
+
779
810
  output_df = self._batch_inference(
780
811
  dataset=dataset,
781
812
  inference_method="predict",
782
813
  expected_output_cols_list=self.output_cols,
783
- expected_output_cols_type="float",
814
+ expected_output_cols_type=expected_type_inferred,
784
815
  )
785
816
  elif isinstance(dataset, pd.DataFrame):
786
817
  output_df = self._sklearn_inference(
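One oddity worth flagging: expected_type_inferred is hard-coded to "float" immediately above the guard, so the signature-based branch can never execute in this regressor template; presumably it activates in the classifier variants of this autogenerated code, where the initial value is an empty string. A distilled, self-contained restatement of the guard (helper names are hypothetical):

```python
from typing import Callable, Dict

def infer_expected_type(
    template_default: str,
    signatures: Dict[str, object],
    sf_type_of: Callable[[object], str],
) -> str:
    # The template default wins unless it is "" (assumed classifier case),
    # where the Snowflake type is derived from the `predict` signature.
    if template_default == "" and "predict" in signatures:
        return sf_type_of(signatures["predict"])
    return template_default

print(infer_expected_type("float", {}, lambda s: "NUMBER"))                # float
print(infer_expected_type("", {"predict": object()}, lambda s: "NUMBER"))  # NUMBER
```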
@@ -851,10 +882,10 @@ class Lars(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
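The behavioral change here: an estimator without a classes_ attribute (a regressor, or an unfitted model) previously yielded no output columns at all for predict_proba-style methods; it now yields a single column named by the prefix, so downstream signature construction never receives an empty output list. A standalone sketch (function shape and per-class naming are assumptions):

```python
from typing import List, Optional, Sequence

def get_output_column_names(classes: Optional[Sequence], prefix: str) -> List[str]:
    if classes is None:           # not a classifier, or not yet fitted
        return [prefix]           # was: return []
    return [f"{prefix}{c}" for c in classes]

print(get_output_column_names(None, "decision_function_"))
# ['decision_function_']
print(get_output_column_names([0, 1], "predict_proba_"))
# ['predict_proba_0', 'predict_proba_1']
```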
@@ -1079,7 +1110,7 @@ class Lars(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1093,8 +1124,9 @@ class Lars(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1120,6 +1152,7 @@ class Lars(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1127,7 +1160,8 @@ class Lars(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1177,14 +1211,14 @@ class Lars(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1202,18 +1236,20 @@ class Lars(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
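Every generated signature now declares the input columns as part of the outputs unless drop_input_cols is set, matching batch inference, where input columns ride along with the predictions. The schema arithmetic, as a minimal sketch with assumed column names:

```python
inputs = ["SEPAL_LENGTH", "SEPAL_WIDTH"]  # assumed input columns
outputs = ["OUTPUT_TARGET"]               # assumed prediction column

for drop_input_cols in (False, True):
    declared_outputs = ([] if drop_input_cols else inputs) + outputs
    print(drop_input_cols, declared_outputs)
# False ['SEPAL_LENGTH', 'SEPAL_WIDTH', 'OUTPUT_TARGET']
# True ['OUTPUT_TARGET']
```

The second file below, lars_cv.py, receives the identical set of changes; only the hunk offsets differ.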
snowflake/ml/modeling/linear_model/lars_cv.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -247,7 +249,6 @@ class LarsCV(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -276,6 +277,15 @@ class LarsCV(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -354,7 +364,7 @@ class LarsCV(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -367,11 +377,12 @@ class LarsCV(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -397,6 +408,7 @@ class LarsCV(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -405,7 +417,8 @@ class LarsCV(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -472,15 +485,15 @@ class LarsCV(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
@@ -490,7 +503,7 @@ class LarsCV(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -536,7 +549,7 @@ class LarsCV(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -628,7 +641,7 @@ class LarsCV(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
        )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -684,26 +697,37 @@ class LarsCV(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -784,11 +808,18 @@ class LarsCV(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -859,10 +890,10 @@ class LarsCV(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1087,7 +1118,7 @@ class LarsCV(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1101,8 +1132,9 @@ class LarsCV(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1128,6 +1160,7 @@ class LarsCV(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1135,7 +1168,8 @@ class LarsCV(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1185,14 +1219,14 @@ class LarsCV(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
        )
 
         cleanup_temp_files([local_score_file_name])
@@ -1210,18 +1244,20 @@ class LarsCV(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: