snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/covariance/graphical_lasso_cv.py
@@ -7,6 +7,7 @@
  #
  import inspect
  import os
+ import posixpath
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
  from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
  from snowflake.snowpark import DataFrame, Session
  from snowflake.snowpark.functions import pandas_udf, sproc
  from snowflake.snowpark.types import PandasSeries
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

  from snowflake.ml.model.model_signature import (
  DataType,
@@ -249,7 +251,6 @@ class GraphicalLassoCV(BaseTransformer):
  sample_weight_col: Optional[str] = None,
  ) -> None:
  super().__init__()
- self.id = str(uuid4()).replace("-", "_").upper()
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

  self._deps = list(deps)
@@ -278,6 +279,15 @@ class GraphicalLassoCV(BaseTransformer):
  self.set_drop_input_cols(drop_input_cols)
  self.set_sample_weight_col(sample_weight_col)

+ def _get_rand_id(self) -> str:
+ """
+ Generate random id to be used in sproc and stage names.
+
+ Returns:
+ Random id string usable in sproc, table, and stage names.
+ """
+ return str(uuid4()).replace("-", "_").upper()
+
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
  """
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
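The removed per-instance `self.id` (assigned once in `__init__`) is replaced by the `_get_rand_id()` helper, so every fit, batch-inference, and score call below derives fresh stage and sproc names instead of reusing one id for the lifetime of the estimator; repeated or concurrent operations on the same instance no longer collide on temp object names. A minimal standalone sketch of the behavior:

```python
from uuid import uuid4

def _get_rand_id() -> str:
    # Fresh id per call; hyphens are not valid in unquoted Snowflake
    # identifiers, hence the replace/upper normalization.
    return str(uuid4()).replace("-", "_").upper()

# Two fits on the same estimator now use two distinct temp stages:
print("SNOWML_TRANSFORM_{}".format(_get_rand_id()))
print("SNOWML_TRANSFORM_{}".format(_get_rand_id()))
```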
@@ -356,7 +366,7 @@ class GraphicalLassoCV(BaseTransformer):
  cp.dump(self._sklearn_object, local_transform_file)

  # Create temp stage to run fit.
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
  SqlResultValidator(
  session=session,
@@ -369,11 +379,12 @@ class GraphicalLassoCV(BaseTransformer):
  expected_value=f"Stage area {transform_stage_name} successfully created."
  ).validate()

- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ # Use posixpath to construct stage paths
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
  local_result_file_name = get_temp_file_path()
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
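The switch from `os.path.join` to `posixpath.join` matters because stage paths are always `/`-separated, while `os.path.join` silently produces `\`-separated paths on Windows clients. A runnable illustration (stage and file names are made up):

```python
import posixpath

stage_name = "SNOWML_TRANSFORM_ABC123"  # hypothetical temp stage
file_name = "model.pkl.gz"              # hypothetical local artifact

# posixpath.join always uses "/", regardless of the client OS.
# os.path.join(stage_name, file_name) would yield
# 'SNOWML_TRANSFORM_ABC123\\model.pkl.gz' on Windows, an invalid stage path.
print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl.gz
```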
@@ -399,6 +410,7 @@ class GraphicalLassoCV(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def fit_wrapper_sproc(
  session: Session,
@@ -407,7 +419,8 @@ class GraphicalLassoCV(BaseTransformer):
  stage_result_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> str:
  import cloudpickle as cp
  import numpy as np
@@ -474,15 +487,15 @@ class GraphicalLassoCV(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- sproc_export_file_name = session.call(
- fit_sproc_name,
+ sproc_export_file_name = fit_wrapper_sproc(
+ session,
  query,
  stage_transform_file_name,
  stage_result_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  if "|" in sproc_export_file_name:
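Two related changes land here: the wrapper is registered with `anonymous=True`, so no named stored procedure is created in the schema (nothing to clean up, no naming privileges required), and the returned handle is invoked directly instead of being looked up by name through `session.call`; `statement_params` becomes an ordinary positional parameter of the wrapper. A schematic sketch of the pattern (requires a live Snowpark session; the body and names are placeholders):

```python
from typing import Dict
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc

def register_fit_sproc(session: Session):
    # anonymous=True: the procedure is not persisted as a named schema object.
    @sproc(replace=True, session=session, anonymous=True,
           packages=["snowflake-snowpark-python"])
    def fit_wrapper_sproc(session: Session, query: str,
                          statement_params: Dict[str, str]) -> str:
        # ... run the query, fit the estimator, upload the pickled result ...
        return "export_file_name"
    return fit_wrapper_sproc

# The handle itself is callable, mirroring the diff:
#   sproc_export_file_name = fit_wrapper_sproc(session, query, ..., statement_params)
# No session.call(fit_sproc_name, ...) lookup by name is needed.
```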
@@ -492,7 +505,7 @@ class GraphicalLassoCV(BaseTransformer):
  print("\n".join(fields[1:]))

  session.file.get(
- os.path.join(stage_result_file_name, sproc_export_file_name),
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
  local_result_file_name,
  statement_params=statement_params
  )
@@ -538,7 +551,7 @@ class GraphicalLassoCV(BaseTransformer):

  # Register vectorized UDF for batch inference
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
- safe_id=self.id, method=inference_method)
+ safe_id=self._get_rand_id(), method=inference_method)

  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
  # will try to pickle all of self which fails.
@@ -630,7 +643,7 @@ class GraphicalLassoCV(BaseTransformer):
  return transformed_pandas_df.to_dict("records")

  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
- safe_id=self.id
+ safe_id=self._get_rand_id()
  )

  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -686,26 +699,37 @@ class GraphicalLassoCV(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
- missing_features = []
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
- missing_features.append(f)
-
- if len(missing_features) > 0:
- raise ValueError(
- "The feature names should match with those that were passed during fit.\n"
- f"Features seen during fit call but not present in the input: {missing_features}\n"
- f"Features in the input dataframe : {input_cols}\n"
- )
- input_df.columns = getattr(estimator, "feature_names_in_")
- else:
- # Just rename the column names to unquoted identifiers.
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+ if (
+ i >= len(input_cols)
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+ and quoted_input_cols[i] not in features_in_dataset)
+ ):
+ missing_features.append(f)
+ elif input_cols[i] in features_in_dataset:
+ columns_to_select.append(input_cols[i])
+ elif unquoted_input_cols[i] in features_in_dataset:
+ columns_to_select.append(unquoted_input_cols[i])
+ else:
+ columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+ raise ValueError(
+ "The feature names should match with those that were passed during fit.\n"
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
+ f"Features in the input dataframe : {input_cols}\n"
+ )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
  input_df
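The rewritten selection tolerates three spellings of each input column: exactly as configured, unquoted, and re-quoted; and it checks them both against what the estimator saw at fit time and against what actually exists in the dataframe. A self-contained sketch of the matching rule, with the `identifier` helpers replaced by simplified stand-ins (illustrative quote handling only):

```python
import pandas as pd

def unescape(name: str) -> str:
    # Stand-in for identifier.get_unescaped_names on a single name.
    return name[1:-1] if name.startswith('"') and name.endswith('"') else name

def escape(name: str) -> str:
    # Stand-in for identifier.get_escaped_names on a single name.
    return f'"{name}"'

def select_features(df: pd.DataFrame, input_cols, required_features):
    unquoted = [unescape(c) for c in input_cols]
    quoted = [escape(u) for u in unquoted]
    present = set(df.columns)
    chosen, missing = [], []
    for i, feat in enumerate(required_features):
        candidates = (input_cols[i], unquoted[i], quoted[i]) if i < len(input_cols) else ()
        if feat not in candidates or not (set(candidates) & present):
            missing.append(feat)
        else:
            # Prefer the configured spelling, then unquoted, then quoted.
            chosen.append(next(c for c in candidates if c in present))
    if missing:
        raise ValueError(f"Features seen during fit but not present in the input: {missing}")
    out = df[chosen]
    out.columns = required_features  # rename to what the estimator expects
    return out

df = pd.DataFrame({"AGE": [30, 41]})
print(select_features(df, ['"AGE"'], ["AGE"]))  # matches via the unquoted spelling
```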
@@ -784,11 +808,18 @@ class GraphicalLassoCV(BaseTransformer):
  Transformed dataset.
  """
  if isinstance(dataset, DataFrame):
+ expected_type_inferred = ""
+ # when it is classifier, infer the datatype from label columns
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
+ expected_type_inferred = convert_sp_to_sf_type(
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
+ )
+
  output_df = self._batch_inference(
  dataset=dataset,
  inference_method="predict",
  expected_output_cols_list=self.output_cols,
- expected_output_cols_type="",
+ expected_output_cols_type=expected_type_inferred,
  )
  elif isinstance(dataset, pd.DataFrame):
  output_df = self._sklearn_inference(
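Rather than passing an empty type string and letting the inference output fall back to a generic type, the Snowpark branch now reads predict's declared output type from the stored model signature and converts it to its Snowflake SQL name. Roughly (the FeatureSpec here is hypothetical; both imports appear in this diff):

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec
from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

# Pretend this is self.model_signatures['predict'].outputs[0]:
output_spec = FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_1")

expected_type_inferred = convert_sp_to_sf_type(output_spec.as_snowpark_type())
print(expected_type_inferred)  # "DOUBLE" — forwarded as expected_output_cols_type
```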
@@ -859,10 +890,10 @@ class GraphicalLassoCV(BaseTransformer):

  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
- Returns an empty list if current object is not a classifier or not yet fitted.
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
  """
  if getattr(self._sklearn_object, "classes_", None) is None:
- return []
+ return [output_cols_prefix]

  classes = self._sklearn_object.classes_
  if isinstance(classes, numpy.ndarray):
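Returning `[output_cols_prefix]` instead of `[]` means estimators without a fitted `classes_` attribute (for example, `decision_function` on an outlier detector) still yield one named output column rather than none. A standalone sketch of the fallback, with the per-class naming shown only for illustration:

```python
from typing import List, Optional

def get_output_column_names(classes: Optional[list], output_cols_prefix: str) -> List[str]:
    if classes is None:
        # Non-classifier: previously [], now a single prefix-named column.
        return [output_cols_prefix]
    return [f"{output_cols_prefix}{c}" for c in classes]  # illustrative multi-class naming

print(get_output_column_names(None, "decision_function_"))  # ['decision_function_']
print(get_output_column_names([0, 1], "predict_proba_"))    # ['predict_proba_0', 'predict_proba_1']
```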
@@ -1087,7 +1118,7 @@ class GraphicalLassoCV(BaseTransformer):
  cp.dump(self._sklearn_object, local_score_file)

  # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  session = dataset._session
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
  SqlResultValidator(
@@ -1101,8 +1132,9 @@ class GraphicalLassoCV(BaseTransformer):
  expected_value=f"Stage area {score_stage_name} successfully created."
  ).validate()

- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ # Use posixpath to construct stage paths
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -1128,6 +1160,7 @@ class GraphicalLassoCV(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def score_wrapper_sproc(
  session: Session,
@@ -1135,7 +1168,8 @@ class GraphicalLassoCV(BaseTransformer):
  stage_score_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> float:
  import cloudpickle as cp
  import numpy as np
@@ -1185,14 +1219,14 @@ class GraphicalLassoCV(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- score = session.call(
- score_sproc_name,
+ score = score_wrapper_sproc(
+ session,
  query,
  stage_score_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  cleanup_temp_files([local_score_file_name])
@@ -1210,18 +1244,20 @@ class GraphicalLassoCV(BaseTransformer):
  if self._sklearn_object._estimator_type == 'classifier':
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  # For regressor, the type of predict is float64
  elif self._sklearn_object._estimator_type == 'regressor':
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  for prob_func in PROB_FUNCTIONS:
  if hasattr(self, prob_func):
  output_cols_prefix: str = f"{prob_func}_"
  output_column_names = self._get_output_column_names(output_cols_prefix)
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)

  @property
  def model_signatures(self) -> Dict[str, ModelSignature]:
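With these changes every recorded signature lists the pass-through input columns ahead of the model's own outputs unless `drop_input_cols` is set, so the declared output schema matches the DataFrame that batch inference actually returns. A sketch of the resulting shape (column names are made up):

```python
from snowflake.ml.model.model_signature import DataType, FeatureSpec, ModelSignature

inputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in ("F1", "F2")]
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name="OUTPUT_1")]
drop_input_cols = False

# Outputs now include the pass-through inputs unless they are dropped:
sig = ModelSignature(inputs, ([] if drop_input_cols else inputs) + outputs)
print(sig.outputs)  # F1, F2, OUTPUT_1 — matching the inference result schema
```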
snowflake/ml/modeling/covariance/ledoit_wolf.py
@@ -7,6 +7,7 @@
  #
  import inspect
  import os
+ import posixpath
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
  from uuid import uuid4

@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
  from snowflake.snowpark import DataFrame, Session
  from snowflake.snowpark.functions import pandas_udf, sproc
  from snowflake.snowpark.types import PandasSeries
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type

  from snowflake.ml.model.model_signature import (
  DataType,
@@ -196,7 +198,6 @@ class LedoitWolf(BaseTransformer):
  sample_weight_col: Optional[str] = None,
  ) -> None:
  super().__init__()
- self.id = str(uuid4()).replace("-", "_").upper()
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])

  self._deps = list(deps)
@@ -218,6 +219,15 @@ class LedoitWolf(BaseTransformer):
  self.set_drop_input_cols(drop_input_cols)
  self.set_sample_weight_col(sample_weight_col)

+ def _get_rand_id(self) -> str:
+ """
+ Generate random id to be used in sproc and stage names.
+
+ Returns:
+ Random id string usable in sproc, table, and stage names.
+ """
+ return str(uuid4()).replace("-", "_").upper()
+
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
  """
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -296,7 +306,7 @@ class LedoitWolf(BaseTransformer):
  cp.dump(self._sklearn_object, local_transform_file)

  # Create temp stage to run fit.
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
  SqlResultValidator(
  session=session,
@@ -309,11 +319,12 @@ class LedoitWolf(BaseTransformer):
  expected_value=f"Stage area {transform_stage_name} successfully created."
  ).validate()

- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ # Use posixpath to construct stage paths
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
  local_result_file_name = get_temp_file_path()
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))

- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -339,6 +350,7 @@ class LedoitWolf(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def fit_wrapper_sproc(
  session: Session,
@@ -347,7 +359,8 @@ class LedoitWolf(BaseTransformer):
  stage_result_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> str:
  import cloudpickle as cp
  import numpy as np
@@ -414,15 +427,15 @@ class LedoitWolf(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- sproc_export_file_name = session.call(
- fit_sproc_name,
+ sproc_export_file_name = fit_wrapper_sproc(
+ session,
  query,
  stage_transform_file_name,
  stage_result_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  if "|" in sproc_export_file_name:
@@ -432,7 +445,7 @@ class LedoitWolf(BaseTransformer):
  print("\n".join(fields[1:]))

  session.file.get(
- os.path.join(stage_result_file_name, sproc_export_file_name),
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
  local_result_file_name,
  statement_params=statement_params
  )
@@ -478,7 +491,7 @@ class LedoitWolf(BaseTransformer):

  # Register vectorized UDF for batch inference
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
- safe_id=self.id, method=inference_method)
+ safe_id=self._get_rand_id(), method=inference_method)

  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
  # will try to pickle all of self which fails.
@@ -570,7 +583,7 @@ class LedoitWolf(BaseTransformer):
  return transformed_pandas_df.to_dict("records")

  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
- safe_id=self.id
+ safe_id=self._get_rand_id()
  )

  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -626,26 +639,37 @@ class LedoitWolf(BaseTransformer):
  # input cols need to match unquoted / quoted
  input_cols = self.input_cols
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)

  estimator = self._sklearn_object

- input_df = dataset[input_cols] # Select input columns with quoted column names.
- if hasattr(estimator, "feature_names_in_"):
- missing_features = []
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
- missing_features.append(f)
-
- if len(missing_features) > 0:
- raise ValueError(
- "The feature names should match with those that were passed during fit.\n"
- f"Features seen during fit call but not present in the input: {missing_features}\n"
- f"Features in the input dataframe : {input_cols}\n"
- )
- input_df.columns = getattr(estimator, "feature_names_in_")
- else:
- # Just rename the column names to unquoted identifiers.
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+ missing_features = []
+ features_in_dataset = set(dataset.columns)
+ columns_to_select = []
+ for i, f in enumerate(features_required_by_estimator):
+ if (
+ i >= len(input_cols)
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+ and quoted_input_cols[i] not in features_in_dataset)
+ ):
+ missing_features.append(f)
+ elif input_cols[i] in features_in_dataset:
+ columns_to_select.append(input_cols[i])
+ elif unquoted_input_cols[i] in features_in_dataset:
+ columns_to_select.append(unquoted_input_cols[i])
+ else:
+ columns_to_select.append(quoted_input_cols[i])
+
+ if len(missing_features) > 0:
+ raise ValueError(
+ "The feature names should match with those that were passed during fit.\n"
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
+ f"Features in the input dataframe : {input_cols}\n"
+ )
+ input_df = dataset[columns_to_select]
+ input_df.columns = features_required_by_estimator

  transformed_numpy_array = getattr(estimator, inference_method)(
  input_df
@@ -724,11 +748,18 @@ class LedoitWolf(BaseTransformer):
  Transformed dataset.
  """
  if isinstance(dataset, DataFrame):
+ expected_type_inferred = ""
+ # when it is classifier, infer the datatype from label columns
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
+ expected_type_inferred = convert_sp_to_sf_type(
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
+ )
+
  output_df = self._batch_inference(
  dataset=dataset,
  inference_method="predict",
  expected_output_cols_list=self.output_cols,
- expected_output_cols_type="",
+ expected_output_cols_type=expected_type_inferred,
  )
  elif isinstance(dataset, pd.DataFrame):
  output_df = self._sklearn_inference(
@@ -799,10 +830,10 @@ class LedoitWolf(BaseTransformer):

  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
- Returns an empty list if current object is not a classifier or not yet fitted.
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
  """
  if getattr(self._sklearn_object, "classes_", None) is None:
- return []
+ return [output_cols_prefix]

  classes = self._sklearn_object.classes_
  if isinstance(classes, numpy.ndarray):
@@ -1027,7 +1058,7 @@ class LedoitWolf(BaseTransformer):
  cp.dump(self._sklearn_object, local_score_file)

  # Create temp stage to run score.
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  session = dataset._session
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
  SqlResultValidator(
@@ -1041,8 +1072,9 @@ class LedoitWolf(BaseTransformer):
  expected_value=f"Stage area {score_stage_name} successfully created."
  ).validate()

- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+ # Use posixpath to construct stage paths
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
  statement_params = telemetry.get_function_usage_statement_params(
  project=_PROJECT,
  subproject=_SUBPROJECT,
@@ -1068,6 +1100,7 @@ class LedoitWolf(BaseTransformer):
  replace=True,
  session=session,
  statement_params=statement_params,
+ anonymous=True
  )
  def score_wrapper_sproc(
  session: Session,
@@ -1075,7 +1108,8 @@ class LedoitWolf(BaseTransformer):
  stage_score_file_name: str,
  input_cols: List[str],
  label_cols: List[str],
- sample_weight_col: Optional[str]
+ sample_weight_col: Optional[str],
+ statement_params: Dict[str, str]
  ) -> float:
  import cloudpickle as cp
  import numpy as np
@@ -1125,14 +1159,14 @@ class LedoitWolf(BaseTransformer):
  api_calls=[Session.call],
  custom_tags=dict([("autogen", True)]),
  )
- score = session.call(
- score_sproc_name,
+ score = score_wrapper_sproc(
+ session,
  query,
  stage_score_file_name,
  identifier.get_unescaped_names(self.input_cols),
  identifier.get_unescaped_names(self.label_cols),
  identifier.get_unescaped_names(self.sample_weight_col),
- statement_params=statement_params,
+ statement_params,
  )

  cleanup_temp_files([local_score_file_name])
@@ -1150,18 +1184,20 @@ class LedoitWolf(BaseTransformer):
  if self._sklearn_object._estimator_type == 'classifier':
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  # For regressor, the type of predict is float64
  elif self._sklearn_object._estimator_type == 'regressor':
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)
  for prob_func in PROB_FUNCTIONS:
  if hasattr(self, prob_func):
  output_cols_prefix: str = f"{prob_func}_"
  output_column_names = self._get_output_column_names(output_cols_prefix)
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
+ ([] if self._drop_input_cols else inputs) + outputs)

  @property
  def model_signatures(self) -> Dict[str, ModelSignature]: