snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/linear_model/lasso_lars_ic.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -244,7 +246,6 @@ class LassoLarsIC(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -273,6 +274,15 @@ class LassoLarsIC(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
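
The new `_get_rand_id` helper replaces the per-instance `self.id` that the previous hunk removes: every temp stage, sproc, and table name now draws a fresh identifier instead of reusing one id for the lifetime of the estimator. A minimal sketch of what it produces (standalone; values illustrative):

    from uuid import uuid4

    # uuid4 hex groups joined by "_" and uppercased, so the result is safe to
    # embed in unquoted Snowflake identifiers such as stage and sproc names.
    rand_id = str(uuid4()).replace("-", "_").upper()
    print("SNOWML_TRANSFORM_{safe_id}".format(safe_id=rand_id))
    # e.g. SNOWML_TRANSFORM_1B9D6BCD_BBFD_4B2D_9B5D_AB8DFBBD4BED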
@@ -351,7 +361,7 @@ class LassoLarsIC(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -364,11 +374,12 @@ class LassoLarsIC(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
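
The `os.path.join` to `posixpath.join` switch in this hunk matters on Windows clients: Snowflake stage paths always use forward slashes, while `os.path.join` would splice in backslashes there. A minimal sketch of the difference (hypothetical names; `ntpath` stands in for `os.path` on Windows so the contrast runs anywhere):

    import ntpath      # os.path as it behaves on Windows
    import posixpath

    stage_name = "SNOWML_TRANSFORM_ABC123"   # hypothetical temp stage name
    file_name = "model.pkl"

    print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
    print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl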
@@ -394,6 +405,7 @@ class LassoLarsIC(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -402,7 +414,8 @@ class LassoLarsIC(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -469,15 +482,15 @@ class LassoLarsIC(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -487,7 +500,7 @@ class LassoLarsIC(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -533,7 +546,7 @@ class LassoLarsIC(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -625,7 +638,7 @@ class LassoLarsIC(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -681,26 +694,37 @@ class LassoLarsIC(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
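
The rewritten block no longer assumes the dataset carries exactly the quoted spelling of each input column: for every feature the estimator saw at fit time it accepts the original, unquoted, or quoted spelling, whichever is actually present in the dataset, and only otherwise reports the feature as missing. A condensed, hypothetical restatement of that matching rule:

    from typing import List, Optional, Set

    def pick_column(i: int, feature: str, input_cols: List[str], unquoted: List[str],
                    quoted: List[str], dataset_cols: Set[str]) -> Optional[str]:
        """Return the spelling of `feature` to select from the dataset, or None if missing."""
        if i >= len(input_cols):
            return None
        spellings = (input_cols[i], unquoted[i], quoted[i])
        if feature not in spellings:
            return None
        # Prefer the original spelling, then the unquoted, then the quoted one.
        return next((s for s in spellings if s in dataset_cols), None)

    # The estimator was fit on COL1, but the dataframe kept the quoted name:
    print(pick_column(0, "COL1", ['"COL1"'], ["COL1"], ['"COL1"'], {'"COL1"'}))  # "COL1"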
@@ -781,11 +805,18 @@ class LassoLarsIC(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
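
For this regressor the template pins `expected_type_inferred` to "float", which leaves the new branch dormant; in the classifier variants of this generated code the variable presumably starts out empty, and the SQL output type is then derived from the recorded predict signature via `convert_sp_to_sf_type`. A small sketch of that mapping (private Snowpark helper; printed values assumed, not verified here):

    from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
    from snowflake.snowpark.types import DoubleType, StringType

    # Maps a Snowpark DataType to the Snowflake SQL type name used for the
    # inference output column.
    print(convert_sp_to_sf_type(DoubleType()))  # e.g. DOUBLE
    print(convert_sp_to_sf_type(StringType()))  # e.g. STRING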
@@ -856,10 +887,10 @@ class LassoLarsIC(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1084,7 +1115,7 @@ class LassoLarsIC(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1098,8 +1129,9 @@ class LassoLarsIC(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1125,6 +1157,7 @@ class LassoLarsIC(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1132,7 +1165,8 @@ class LassoLarsIC(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1182,14 +1216,14 @@ class LassoLarsIC(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1207,18 +1241,20 @@ class LassoLarsIC(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                    ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                    ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                    ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
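
The signature change closing out this file is the same in all three branches: unless `drop_input_cols` is set, each recorded signature's outputs now list the input features ahead of the prediction columns, so the inputs are echoed in the declared output schema. A minimal sketch with hypothetical feature names (the LinearRegression diff below applies the identical set of changes, so the notes above carry over):

    from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

    inputs = [FeatureSpec(dtype=DataType.DOUBLE, name="FEATURE_1")]
    outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_1")]
    drop_input_cols = False  # the default; True would keep only OUTPUT_1

    # Outputs now carry the inputs first, then the predictions.
    sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)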
snowflake/ml/modeling/linear_model/linear_regression.py

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -203,7 +205,6 @@ class LinearRegression(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -226,6 +227,15 @@ class LinearRegression(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -304,7 +314,7 @@ class LinearRegression(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -317,11 +327,12 @@ class LinearRegression(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -347,6 +358,7 @@ class LinearRegression(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -355,7 +367,8 @@ class LinearRegression(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -422,15 +435,15 @@ class LinearRegression(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         if "|" in sproc_export_file_name:
@@ -440,7 +453,7 @@ class LinearRegression(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -486,7 +499,7 @@ class LinearRegression(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -578,7 +591,7 @@ class LinearRegression(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
        )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -634,26 +647,37 @@ class LinearRegression(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -734,11 +758,18 @@ class LinearRegression(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -809,10 +840,10 @@ class LinearRegression(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1037,7 +1068,7 @@ class LinearRegression(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1051,8 +1082,9 @@ class LinearRegression(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1078,6 +1110,7 @@ class LinearRegression(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1085,7 +1118,8 @@ class LinearRegression(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1135,14 +1169,14 @@ class LinearRegression(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1160,18 +1194,20 @@ class LinearRegression(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                    ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                    ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                    ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]: