snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (196)
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
snowflake/ml/modeling/covariance/empirical_covariance.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -190,7 +192,6 @@ class EmpiricalCovariance(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -211,6 +212,15 @@ class EmpiricalCovariance(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -289,7 +299,7 @@ class EmpiricalCovariance(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -302,11 +312,12 @@ class EmpiricalCovariance(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -332,6 +343,7 @@ class EmpiricalCovariance(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -340,7 +352,8 @@ class EmpiricalCovariance(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -407,15 +420,15 @@ class EmpiricalCovariance(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
 
@@ -425,7 +438,7 @@ class EmpiricalCovariance(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -471,7 +484,7 @@ class EmpiricalCovariance(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -563,7 +576,7 @@ class EmpiricalCovariance(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -619,26 +632,37 @@ class EmpiricalCovariance(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -717,11 +741,18 @@ class EmpiricalCovariance(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -792,10 +823,10 @@ class EmpiricalCovariance(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1020,7 +1051,7 @@ class EmpiricalCovariance(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1034,8 +1065,9 @@ class EmpiricalCovariance(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1061,6 +1093,7 @@ class EmpiricalCovariance(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
        )
         def score_wrapper_sproc(
             session: Session,
@@ -1068,7 +1101,8 @@ class EmpiricalCovariance(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1118,14 +1152,14 @@ class EmpiricalCovariance(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1143,18 +1177,20 @@ class EmpiricalCovariance(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
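
Editor's note on two recurring changes above: os.path.join is platform-dependent and emits backslash separators on Windows, while Snowflake stage paths use forward slashes, so the generated code now builds stage paths with posixpath.join. Separately, replacing the per-instance self.id with a call to _get_rand_id() gives each temporary stage, sproc, and table a freshly generated identifier on every invocation. A minimal sketch of the path-separator difference (standalone illustration with hypothetical names, not code from the package):

    import ntpath      # the Windows flavor of os.path
    import posixpath   # always joins with forward slashes

    stage_name = "SNOWML_TRANSFORM_ABC123"  # hypothetical stage name
    file_name = "model.pkl"                 # hypothetical file name

    # On Windows, os.path.join resolves to ntpath.join and would produce
    # a backslash-separated path that is not a valid stage location.
    print(ntpath.join(stage_name, file_name))     # SNOWML_TRANSFORM_ABC123\model.pkl
    print(posixpath.join(stage_name, file_name))  # SNOWML_TRANSFORM_ABC123/model.pkl

The same set of changes is applied to the next file.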
snowflake/ml/modeling/covariance/graphical_lasso.py
@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -219,7 +221,6 @@ class GraphicalLasso(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -245,6 +246,15 @@ class GraphicalLasso(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
 
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
+
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -323,7 +333,7 @@ class GraphicalLasso(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
 
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -336,11 +346,12 @@ class GraphicalLasso(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
 
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
 
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -366,6 +377,7 @@ class GraphicalLasso(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -374,7 +386,8 @@ class GraphicalLasso(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -441,15 +454,15 @@ class GraphicalLasso(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
        )
 
 
@@ -459,7 +472,7 @@ class GraphicalLasso(BaseTransformer):
             print("\n".join(fields[1:]))
 
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -505,7 +518,7 @@ class GraphicalLasso(BaseTransformer):
 
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-            safe_id=self.id, method=inference_method)
+            safe_id=self._get_rand_id(), method=inference_method)
 
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -597,7 +610,7 @@ class GraphicalLasso(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
 
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
 
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -653,26 +666,37 @@ class GraphicalLasso(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
 
         estimator = self._sklearn_object
 
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+            ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
 
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -751,11 +775,18 @@ class GraphicalLasso(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
+
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -826,10 +857,10 @@ class GraphicalLasso(BaseTransformer):
 
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
 
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1054,7 +1085,7 @@ class GraphicalLasso(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
 
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1068,8 +1099,9 @@ class GraphicalLasso(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
 
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1095,6 +1127,7 @@ class GraphicalLasso(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1102,7 +1135,8 @@ class GraphicalLasso(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1152,14 +1186,14 @@ class GraphicalLasso(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
        )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
 
         cleanup_temp_files([local_score_file_name])
@@ -1177,18 +1211,20 @@ class GraphicalLasso(BaseTransformer):
         if self._sklearn_object._estimator_type == 'classifier':
             outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
             outputs = _rename_features(outputs, self.output_cols) # rename the output columns
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         # For regressor, the type of predict is float64
         elif self._sklearn_object._estimator_type == 'regressor':
             outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-            self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
-
+            self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                   ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
 
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:
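
Editor's note on the anonymous=True and invocation changes shared by both files: Snowpark's sproc() registration returns a callable StoredProcedure handle, and a procedure registered anonymously has no name in the database, so it must be invoked through that handle (which accepts the session as an optional first positional argument, matching the calls in the diff) rather than by name via session.call. The wrapper sprocs also gain an explicit statement_params parameter, so that value now travels into the procedure body as an ordinary argument instead of a session.call keyword. A rough sketch of the registration-and-call pattern, assuming an already-created session and a hypothetical add_one procedure (not from the package):

    from snowflake.snowpark import Session
    from snowflake.snowpark.functions import sproc

    def demo(session: Session) -> int:
        # anonymous=True registers a procedure with no database name;
        # sproc() returns a callable handle for it.
        @sproc(replace=True, session=session, anonymous=True,
               packages=["snowflake-snowpark-python"])
        def add_one(session: Session, x: int) -> int:
            return x + 1

        # Invoke through the handle; session.call("ADD_ONE", 41) would not
        # work because the procedure was never given a name.
        return add_one(session, 41)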