snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. snowflake/ml/_internal/env_utils.py +2 -1
  2. snowflake/ml/_internal/file_utils.py +35 -40
  3. snowflake/ml/_internal/telemetry.py +5 -8
  4. snowflake/ml/_internal/utils/identifier.py +74 -7
  5. snowflake/ml/_internal/utils/uri.py +7 -2
  6. snowflake/ml/model/_core_requirements.py +1 -1
  7. snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
  8. snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
  9. snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
  10. snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
  11. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
  12. snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
  13. snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
  14. snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
  15. snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
  16. snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
  17. snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
  18. snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
  19. snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
  20. snowflake/ml/model/_deployer.py +14 -27
  21. snowflake/ml/model/_env.py +4 -4
  22. snowflake/ml/model/_handlers/_base.py +3 -1
  23. snowflake/ml/model/_handlers/custom.py +14 -2
  24. snowflake/ml/model/_handlers/pytorch.py +186 -0
  25. snowflake/ml/model/_handlers/sklearn.py +14 -8
  26. snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
  27. snowflake/ml/model/_handlers/torchscript.py +180 -0
  28. snowflake/ml/model/_handlers/xgboost.py +19 -9
  29. snowflake/ml/model/_model.py +27 -21
  30. snowflake/ml/model/_model_meta.py +33 -19
  31. snowflake/ml/model/model_signature.py +446 -66
  32. snowflake/ml/model/type_hints.py +28 -15
  33. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
  34. snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
  35. snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
  36. snowflake/ml/modeling/cluster/birch.py +79 -43
  37. snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
  38. snowflake/ml/modeling/cluster/dbscan.py +79 -43
  39. snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
  40. snowflake/ml/modeling/cluster/k_means.py +79 -43
  41. snowflake/ml/modeling/cluster/mean_shift.py +79 -43
  42. snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
  43. snowflake/ml/modeling/cluster/optics.py +79 -43
  44. snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
  45. snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
  46. snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
  47. snowflake/ml/modeling/compose/column_transformer.py +79 -43
  48. snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
  49. snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
  50. snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
  51. snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
  52. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
  53. snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
  54. snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
  55. snowflake/ml/modeling/covariance/oas.py +79 -43
  56. snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
  57. snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
  58. snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
  59. snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
  60. snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
  61. snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
  62. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
  63. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
  64. snowflake/ml/modeling/decomposition/pca.py +79 -43
  65. snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
  66. snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
  68. snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
  69. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
  70. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
  71. snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
  72. snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
  73. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
  74. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
  75. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
  76. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
  78. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
  79. snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
  80. snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
  81. snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
  82. snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
  83. snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
  84. snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
  85. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
  86. snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
  87. snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
  88. snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
  89. snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
  90. snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
  91. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
  92. snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
  93. snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
  94. snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
  95. snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
  96. snowflake/ml/modeling/impute/knn_imputer.py +79 -43
  97. snowflake/ml/modeling/impute/missing_indicator.py +79 -43
  98. snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
  99. snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
  100. snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
  101. snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
  102. snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
  103. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
  104. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
  105. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
  106. snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
  107. snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
  108. snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
  109. snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
  110. snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
  111. snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
  112. snowflake/ml/modeling/linear_model/lars.py +79 -43
  113. snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
  114. snowflake/ml/modeling/linear_model/lasso.py +79 -43
  115. snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
  116. snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
  117. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
  118. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
  119. snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
  120. snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
  121. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
  122. snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
  123. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
  124. snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
  125. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
  126. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
  127. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
  128. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
  129. snowflake/ml/modeling/linear_model/perceptron.py +79 -43
  130. snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
  131. snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
  132. snowflake/ml/modeling/linear_model/ridge.py +79 -43
  133. snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
  134. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
  135. snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
  136. snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
  137. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
  138. snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
  139. snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
  140. snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
  141. snowflake/ml/modeling/manifold/isomap.py +79 -43
  142. snowflake/ml/modeling/manifold/mds.py +79 -43
  143. snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
  144. snowflake/ml/modeling/manifold/tsne.py +79 -43
  145. snowflake/ml/modeling/metrics/classification.py +6 -1
  146. snowflake/ml/modeling/metrics/regression.py +517 -9
  147. snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
  148. snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
  149. snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
  150. snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
  151. snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
  152. snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
  153. snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
  154. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
  155. snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
  156. snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
  157. snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
  158. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
  159. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
  160. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
  161. snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
  162. snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
  163. snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
  164. snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
  165. snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
  166. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
  167. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
  168. snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
  169. snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
  170. snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
  171. snowflake/ml/modeling/pipeline/pipeline.py +24 -0
  172. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
  173. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
  174. snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
  175. snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
  176. snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
  177. snowflake/ml/modeling/svm/linear_svc.py +79 -43
  178. snowflake/ml/modeling/svm/linear_svr.py +79 -43
  179. snowflake/ml/modeling/svm/nu_svc.py +79 -43
  180. snowflake/ml/modeling/svm/nu_svr.py +79 -43
  181. snowflake/ml/modeling/svm/svc.py +79 -43
  182. snowflake/ml/modeling/svm/svr.py +79 -43
  183. snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
  184. snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
  185. snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
  186. snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
  187. snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
  188. snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
  189. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
  190. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
  191. snowflake/ml/registry/model_registry.py +123 -121
  192. snowflake/ml/version.py +1 -1
  193. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
  194. snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
  195. snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
  196. {snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -232,7 +234,6 @@ class FactorAnalysis(BaseTransformer):
232
234
  sample_weight_col: Optional[str] = None,
233
235
  ) -> None:
234
236
  super().__init__()
235
- self.id = str(uuid4()).replace("-", "_").upper()
236
237
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
237
238
 
238
239
  self._deps = list(deps)
@@ -260,6 +261,15 @@ class FactorAnalysis(BaseTransformer):
260
261
  self.set_drop_input_cols(drop_input_cols)
261
262
  self.set_sample_weight_col(sample_weight_col)
262
263
 
264
+ def _get_rand_id(self) -> str:
265
+ """
266
+ Generate random id to be used in sproc and stage names.
267
+
268
+ Returns:
269
+ Random id string usable in sproc, table, and stage names.
270
+ """
271
+ return str(uuid4()).replace("-", "_").upper()
272
+
263
273
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
264
274
  """
265
275
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -338,7 +348,7 @@ class FactorAnalysis(BaseTransformer):
338
348
  cp.dump(self._sklearn_object, local_transform_file)
339
349
 
340
350
  # Create temp stage to run fit.
341
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
351
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
342
352
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
343
353
  SqlResultValidator(
344
354
  session=session,
@@ -351,11 +361,12 @@ class FactorAnalysis(BaseTransformer):
351
361
  expected_value=f"Stage area {transform_stage_name} successfully created."
352
362
  ).validate()
353
363
 
354
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
364
+ # Use posixpath to construct stage paths
365
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
366
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
355
367
  local_result_file_name = get_temp_file_path()
356
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
357
368
 
358
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
369
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
359
370
  statement_params = telemetry.get_function_usage_statement_params(
360
371
  project=_PROJECT,
361
372
  subproject=_SUBPROJECT,
@@ -381,6 +392,7 @@ class FactorAnalysis(BaseTransformer):
381
392
  replace=True,
382
393
  session=session,
383
394
  statement_params=statement_params,
395
+ anonymous=True
384
396
  )
385
397
  def fit_wrapper_sproc(
386
398
  session: Session,
@@ -389,7 +401,8 @@ class FactorAnalysis(BaseTransformer):
389
401
  stage_result_file_name: str,
390
402
  input_cols: List[str],
391
403
  label_cols: List[str],
392
- sample_weight_col: Optional[str]
404
+ sample_weight_col: Optional[str],
405
+ statement_params: Dict[str, str]
393
406
  ) -> str:
394
407
  import cloudpickle as cp
395
408
  import numpy as np
@@ -456,15 +469,15 @@ class FactorAnalysis(BaseTransformer):
456
469
  api_calls=[Session.call],
457
470
  custom_tags=dict([("autogen", True)]),
458
471
  )
459
- sproc_export_file_name = session.call(
460
- fit_sproc_name,
472
+ sproc_export_file_name = fit_wrapper_sproc(
473
+ session,
461
474
  query,
462
475
  stage_transform_file_name,
463
476
  stage_result_file_name,
464
477
  identifier.get_unescaped_names(self.input_cols),
465
478
  identifier.get_unescaped_names(self.label_cols),
466
479
  identifier.get_unescaped_names(self.sample_weight_col),
467
- statement_params=statement_params,
480
+ statement_params,
468
481
  )
469
482
 
470
483
  if "|" in sproc_export_file_name:
@@ -474,7 +487,7 @@ class FactorAnalysis(BaseTransformer):
474
487
  print("\n".join(fields[1:]))
475
488
 
476
489
  session.file.get(
477
- os.path.join(stage_result_file_name, sproc_export_file_name),
490
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
478
491
  local_result_file_name,
479
492
  statement_params=statement_params
480
493
  )
@@ -520,7 +533,7 @@ class FactorAnalysis(BaseTransformer):
520
533
 
521
534
  # Register vectorized UDF for batch inference
522
535
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
523
- safe_id=self.id, method=inference_method)
536
+ safe_id=self._get_rand_id(), method=inference_method)
524
537
 
525
538
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
526
539
  # will try to pickle all of self which fails.
@@ -612,7 +625,7 @@ class FactorAnalysis(BaseTransformer):
612
625
  return transformed_pandas_df.to_dict("records")
613
626
 
614
627
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
615
- safe_id=self.id
628
+ safe_id=self._get_rand_id()
616
629
  )
617
630
 
618
631
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -668,26 +681,37 @@ class FactorAnalysis(BaseTransformer):
668
681
  # input cols need to match unquoted / quoted
669
682
  input_cols = self.input_cols
670
683
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
684
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
671
685
 
672
686
  estimator = self._sklearn_object
673
687
 
674
- input_df = dataset[input_cols] # Select input columns with quoted column names.
675
- if hasattr(estimator, "feature_names_in_"):
676
- missing_features = []
677
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
678
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
679
- missing_features.append(f)
680
-
681
- if len(missing_features) > 0:
682
- raise ValueError(
683
- "The feature names should match with those that were passed during fit.\n"
684
- f"Features seen during fit call but not present in the input: {missing_features}\n"
685
- f"Features in the input dataframe : {input_cols}\n"
686
- )
687
- input_df.columns = getattr(estimator, "feature_names_in_")
688
- else:
689
- # Just rename the column names to unquoted identifiers.
690
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
688
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
689
+ missing_features = []
690
+ features_in_dataset = set(dataset.columns)
691
+ columns_to_select = []
692
+ for i, f in enumerate(features_required_by_estimator):
693
+ if (
694
+ i >= len(input_cols)
695
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
696
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
697
+ and quoted_input_cols[i] not in features_in_dataset)
698
+ ):
699
+ missing_features.append(f)
700
+ elif input_cols[i] in features_in_dataset:
701
+ columns_to_select.append(input_cols[i])
702
+ elif unquoted_input_cols[i] in features_in_dataset:
703
+ columns_to_select.append(unquoted_input_cols[i])
704
+ else:
705
+ columns_to_select.append(quoted_input_cols[i])
706
+
707
+ if len(missing_features) > 0:
708
+ raise ValueError(
709
+ "The feature names should match with those that were passed during fit.\n"
710
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
711
+ f"Features in the input dataframe : {input_cols}\n"
712
+ )
713
+ input_df = dataset[columns_to_select]
714
+ input_df.columns = features_required_by_estimator
691
715
 
692
716
  transformed_numpy_array = getattr(estimator, inference_method)(
693
717
  input_df
@@ -766,11 +790,18 @@ class FactorAnalysis(BaseTransformer):
766
790
  Transformed dataset.
767
791
  """
768
792
  if isinstance(dataset, DataFrame):
793
+ expected_type_inferred = ""
794
+ # when it is classifier, infer the datatype from label columns
795
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
796
+ expected_type_inferred = convert_sp_to_sf_type(
797
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
798
+ )
799
+
769
800
  output_df = self._batch_inference(
770
801
  dataset=dataset,
771
802
  inference_method="predict",
772
803
  expected_output_cols_list=self.output_cols,
773
- expected_output_cols_type="",
804
+ expected_output_cols_type=expected_type_inferred,
774
805
  )
775
806
  elif isinstance(dataset, pd.DataFrame):
776
807
  output_df = self._sklearn_inference(
@@ -843,10 +874,10 @@ class FactorAnalysis(BaseTransformer):
843
874
 
844
875
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
845
876
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
846
- Returns an empty list if current object is not a classifier or not yet fitted.
877
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
847
878
  """
848
879
  if getattr(self._sklearn_object, "classes_", None) is None:
849
- return []
880
+ return [output_cols_prefix]
850
881
 
851
882
  classes = self._sklearn_object.classes_
852
883
  if isinstance(classes, numpy.ndarray):
@@ -1071,7 +1102,7 @@ class FactorAnalysis(BaseTransformer):
1071
1102
  cp.dump(self._sklearn_object, local_score_file)
1072
1103
 
1073
1104
  # Create temp stage to run score.
1074
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1105
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1075
1106
  session = dataset._session
1076
1107
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1077
1108
  SqlResultValidator(
@@ -1085,8 +1116,9 @@ class FactorAnalysis(BaseTransformer):
1085
1116
  expected_value=f"Stage area {score_stage_name} successfully created."
1086
1117
  ).validate()
1087
1118
 
1088
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1089
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1119
+ # Use posixpath to construct stage paths
1120
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1121
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1090
1122
  statement_params = telemetry.get_function_usage_statement_params(
1091
1123
  project=_PROJECT,
1092
1124
  subproject=_SUBPROJECT,
@@ -1112,6 +1144,7 @@ class FactorAnalysis(BaseTransformer):
1112
1144
  replace=True,
1113
1145
  session=session,
1114
1146
  statement_params=statement_params,
1147
+ anonymous=True
1115
1148
  )
1116
1149
  def score_wrapper_sproc(
1117
1150
  session: Session,
@@ -1119,7 +1152,8 @@ class FactorAnalysis(BaseTransformer):
1119
1152
  stage_score_file_name: str,
1120
1153
  input_cols: List[str],
1121
1154
  label_cols: List[str],
1122
- sample_weight_col: Optional[str]
1155
+ sample_weight_col: Optional[str],
1156
+ statement_params: Dict[str, str]
1123
1157
  ) -> float:
1124
1158
  import cloudpickle as cp
1125
1159
  import numpy as np
@@ -1169,14 +1203,14 @@ class FactorAnalysis(BaseTransformer):
1169
1203
  api_calls=[Session.call],
1170
1204
  custom_tags=dict([("autogen", True)]),
1171
1205
  )
1172
- score = session.call(
1173
- score_sproc_name,
1206
+ score = score_wrapper_sproc(
1207
+ session,
1174
1208
  query,
1175
1209
  stage_score_file_name,
1176
1210
  identifier.get_unescaped_names(self.input_cols),
1177
1211
  identifier.get_unescaped_names(self.label_cols),
1178
1212
  identifier.get_unescaped_names(self.sample_weight_col),
1179
- statement_params=statement_params,
1213
+ statement_params,
1180
1214
  )
1181
1215
 
1182
1216
  cleanup_temp_files([local_score_file_name])
@@ -1194,18 +1228,20 @@ class FactorAnalysis(BaseTransformer):
1194
1228
  if self._sklearn_object._estimator_type == 'classifier':
1195
1229
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1196
1230
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1197
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1231
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1232
+ ([] if self._drop_input_cols else inputs) + outputs)
1198
1233
  # For regressor, the type of predict is float64
1199
1234
  elif self._sklearn_object._estimator_type == 'regressor':
1200
1235
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1201
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1202
-
1236
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1237
+ ([] if self._drop_input_cols else inputs) + outputs)
1203
1238
  for prob_func in PROB_FUNCTIONS:
1204
1239
  if hasattr(self, prob_func):
1205
1240
  output_cols_prefix: str = f"{prob_func}_"
1206
1241
  output_column_names = self._get_output_column_names(output_cols_prefix)
1207
1242
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1208
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1243
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1244
+ ([] if self._drop_input_cols else inputs) + outputs)
1209
1245
 
1210
1246
  @property
1211
1247
  def model_signatures(self) -> Dict[str, ModelSignature]:
@@ -7,6 +7,7 @@
7
7
  #
8
8
  import inspect
9
9
  import os
10
+ import posixpath
10
11
  from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
11
12
  from uuid import uuid4
12
13
 
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
27
28
  from snowflake.snowpark import DataFrame, Session
28
29
  from snowflake.snowpark.functions import pandas_udf, sproc
29
30
  from snowflake.snowpark.types import PandasSeries
31
+ from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
30
32
 
31
33
  from snowflake.ml.model.model_signature import (
32
34
  DataType,
@@ -249,7 +251,6 @@ class FastICA(BaseTransformer):
249
251
  sample_weight_col: Optional[str] = None,
250
252
  ) -> None:
251
253
  super().__init__()
252
- self.id = str(uuid4()).replace("-", "_").upper()
253
254
  deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
254
255
 
255
256
  self._deps = list(deps)
@@ -278,6 +279,15 @@ class FastICA(BaseTransformer):
278
279
  self.set_drop_input_cols(drop_input_cols)
279
280
  self.set_sample_weight_col(sample_weight_col)
280
281
 
282
+ def _get_rand_id(self) -> str:
283
+ """
284
+ Generate random id to be used in sproc and stage names.
285
+
286
+ Returns:
287
+ Random id string usable in sproc, table, and stage names.
288
+ """
289
+ return str(uuid4()).replace("-", "_").upper()
290
+
281
291
  def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
282
292
  """
283
293
  Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -356,7 +366,7 @@ class FastICA(BaseTransformer):
356
366
  cp.dump(self._sklearn_object, local_transform_file)
357
367
 
358
368
  # Create temp stage to run fit.
359
- transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
369
+ transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
360
370
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
361
371
  SqlResultValidator(
362
372
  session=session,
@@ -369,11 +379,12 @@ class FastICA(BaseTransformer):
369
379
  expected_value=f"Stage area {transform_stage_name} successfully created."
370
380
  ).validate()
371
381
 
372
- stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
382
+ # Use posixpath to construct stage paths
383
+ stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
384
+ stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
373
385
  local_result_file_name = get_temp_file_path()
374
- stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
375
386
 
376
- fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
387
+ fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
377
388
  statement_params = telemetry.get_function_usage_statement_params(
378
389
  project=_PROJECT,
379
390
  subproject=_SUBPROJECT,
@@ -399,6 +410,7 @@ class FastICA(BaseTransformer):
399
410
  replace=True,
400
411
  session=session,
401
412
  statement_params=statement_params,
413
+ anonymous=True
402
414
  )
403
415
  def fit_wrapper_sproc(
404
416
  session: Session,
@@ -407,7 +419,8 @@ class FastICA(BaseTransformer):
407
419
  stage_result_file_name: str,
408
420
  input_cols: List[str],
409
421
  label_cols: List[str],
410
- sample_weight_col: Optional[str]
422
+ sample_weight_col: Optional[str],
423
+ statement_params: Dict[str, str]
411
424
  ) -> str:
412
425
  import cloudpickle as cp
413
426
  import numpy as np
@@ -474,15 +487,15 @@ class FastICA(BaseTransformer):
474
487
  api_calls=[Session.call],
475
488
  custom_tags=dict([("autogen", True)]),
476
489
  )
477
- sproc_export_file_name = session.call(
478
- fit_sproc_name,
490
+ sproc_export_file_name = fit_wrapper_sproc(
491
+ session,
479
492
  query,
480
493
  stage_transform_file_name,
481
494
  stage_result_file_name,
482
495
  identifier.get_unescaped_names(self.input_cols),
483
496
  identifier.get_unescaped_names(self.label_cols),
484
497
  identifier.get_unescaped_names(self.sample_weight_col),
485
- statement_params=statement_params,
498
+ statement_params,
486
499
  )
487
500
 
488
501
  if "|" in sproc_export_file_name:
@@ -492,7 +505,7 @@ class FastICA(BaseTransformer):
492
505
  print("\n".join(fields[1:]))
493
506
 
494
507
  session.file.get(
495
- os.path.join(stage_result_file_name, sproc_export_file_name),
508
+ posixpath.join(stage_result_file_name, sproc_export_file_name),
496
509
  local_result_file_name,
497
510
  statement_params=statement_params
498
511
  )
@@ -538,7 +551,7 @@ class FastICA(BaseTransformer):
538
551
 
539
552
  # Register vectorized UDF for batch inference
540
553
  batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
541
- safe_id=self.id, method=inference_method)
554
+ safe_id=self._get_rand_id(), method=inference_method)
542
555
 
543
556
  # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
544
557
  # will try to pickle all of self which fails.
@@ -630,7 +643,7 @@ class FastICA(BaseTransformer):
630
643
  return transformed_pandas_df.to_dict("records")
631
644
 
632
645
  batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
633
- safe_id=self.id
646
+ safe_id=self._get_rand_id()
634
647
  )
635
648
 
636
649
  pass_through_columns = self._get_pass_through_columns(dataset)
@@ -686,26 +699,37 @@ class FastICA(BaseTransformer):
686
699
  # input cols need to match unquoted / quoted
687
700
  input_cols = self.input_cols
688
701
  unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
702
+ quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
689
703
 
690
704
  estimator = self._sklearn_object
691
705
 
692
- input_df = dataset[input_cols] # Select input columns with quoted column names.
693
- if hasattr(estimator, "feature_names_in_"):
694
- missing_features = []
695
- for i, f in enumerate(getattr(estimator, "feature_names_in_")):
696
- if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
697
- missing_features.append(f)
698
-
699
- if len(missing_features) > 0:
700
- raise ValueError(
701
- "The feature names should match with those that were passed during fit.\n"
702
- f"Features seen during fit call but not present in the input: {missing_features}\n"
703
- f"Features in the input dataframe : {input_cols}\n"
704
- )
705
- input_df.columns = getattr(estimator, "feature_names_in_")
706
- else:
707
- # Just rename the column names to unquoted identifiers.
708
- input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
706
+ features_required_by_estimator = getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
707
+ missing_features = []
708
+ features_in_dataset = set(dataset.columns)
709
+ columns_to_select = []
710
+ for i, f in enumerate(features_required_by_estimator):
711
+ if (
712
+ i >= len(input_cols)
713
+ or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
714
+ or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
715
+ and quoted_input_cols[i] not in features_in_dataset)
716
+ ):
717
+ missing_features.append(f)
718
+ elif input_cols[i] in features_in_dataset:
719
+ columns_to_select.append(input_cols[i])
720
+ elif unquoted_input_cols[i] in features_in_dataset:
721
+ columns_to_select.append(unquoted_input_cols[i])
722
+ else:
723
+ columns_to_select.append(quoted_input_cols[i])
724
+
725
+ if len(missing_features) > 0:
726
+ raise ValueError(
727
+ "The feature names should match with those that were passed during fit.\n"
728
+ f"Features seen during fit call but not present in the input: {missing_features}\n"
729
+ f"Features in the input dataframe : {input_cols}\n"
730
+ )
731
+ input_df = dataset[columns_to_select]
732
+ input_df.columns = features_required_by_estimator
709
733
 
710
734
  transformed_numpy_array = getattr(estimator, inference_method)(
711
735
  input_df
@@ -784,11 +808,18 @@ class FastICA(BaseTransformer):
784
808
  Transformed dataset.
785
809
  """
786
810
  if isinstance(dataset, DataFrame):
811
+ expected_type_inferred = ""
812
+ # when it is classifier, infer the datatype from label columns
813
+ if expected_type_inferred == "" and 'predict' in self.model_signatures:
814
+ expected_type_inferred = convert_sp_to_sf_type(
815
+ self.model_signatures['predict'].outputs[0].as_snowpark_type()
816
+ )
817
+
787
818
  output_df = self._batch_inference(
788
819
  dataset=dataset,
789
820
  inference_method="predict",
790
821
  expected_output_cols_list=self.output_cols,
791
- expected_output_cols_type="",
822
+ expected_output_cols_type=expected_type_inferred,
792
823
  )
793
824
  elif isinstance(dataset, pd.DataFrame):
794
825
  output_df = self._sklearn_inference(
@@ -861,10 +892,10 @@ class FastICA(BaseTransformer):
861
892
 
862
893
  def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
863
894
  """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
864
- Returns an empty list if current object is not a classifier or not yet fitted.
895
+ Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
865
896
  """
866
897
  if getattr(self._sklearn_object, "classes_", None) is None:
867
- return []
898
+ return [output_cols_prefix]
868
899
 
869
900
  classes = self._sklearn_object.classes_
870
901
  if isinstance(classes, numpy.ndarray):
@@ -1089,7 +1120,7 @@ class FastICA(BaseTransformer):
1089
1120
  cp.dump(self._sklearn_object, local_score_file)
1090
1121
 
1091
1122
  # Create temp stage to run score.
1092
- score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1123
+ score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1093
1124
  session = dataset._session
1094
1125
  stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
1095
1126
  SqlResultValidator(
@@ -1103,8 +1134,9 @@ class FastICA(BaseTransformer):
1103
1134
  expected_value=f"Stage area {score_stage_name} successfully created."
1104
1135
  ).validate()
1105
1136
 
1106
- stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
1107
- score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
1137
+ # Use posixpath to construct stage paths
1138
+ stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
1139
+ score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
1108
1140
  statement_params = telemetry.get_function_usage_statement_params(
1109
1141
  project=_PROJECT,
1110
1142
  subproject=_SUBPROJECT,
@@ -1130,6 +1162,7 @@ class FastICA(BaseTransformer):
1130
1162
  replace=True,
1131
1163
  session=session,
1132
1164
  statement_params=statement_params,
1165
+ anonymous=True
1133
1166
  )
1134
1167
  def score_wrapper_sproc(
1135
1168
  session: Session,
@@ -1137,7 +1170,8 @@ class FastICA(BaseTransformer):
1137
1170
  stage_score_file_name: str,
1138
1171
  input_cols: List[str],
1139
1172
  label_cols: List[str],
1140
- sample_weight_col: Optional[str]
1173
+ sample_weight_col: Optional[str],
1174
+ statement_params: Dict[str, str]
1141
1175
  ) -> float:
1142
1176
  import cloudpickle as cp
1143
1177
  import numpy as np
@@ -1187,14 +1221,14 @@ class FastICA(BaseTransformer):
1187
1221
  api_calls=[Session.call],
1188
1222
  custom_tags=dict([("autogen", True)]),
1189
1223
  )
1190
- score = session.call(
1191
- score_sproc_name,
1224
+ score = score_wrapper_sproc(
1225
+ session,
1192
1226
  query,
1193
1227
  stage_score_file_name,
1194
1228
  identifier.get_unescaped_names(self.input_cols),
1195
1229
  identifier.get_unescaped_names(self.label_cols),
1196
1230
  identifier.get_unescaped_names(self.sample_weight_col),
1197
- statement_params=statement_params,
1231
+ statement_params,
1198
1232
  )
1199
1233
 
1200
1234
  cleanup_temp_files([local_score_file_name])
@@ -1212,18 +1246,20 @@ class FastICA(BaseTransformer):
1212
1246
  if self._sklearn_object._estimator_type == 'classifier':
1213
1247
  outputs = _infer_signature(dataset[self.label_cols], "output") # label columns is the desired type for output
1214
1248
  outputs = _rename_features(outputs, self.output_cols) # rename the output columns
1215
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1249
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1250
+ ([] if self._drop_input_cols else inputs) + outputs)
1216
1251
  # For regressor, the type of predict is float64
1217
1252
  elif self._sklearn_object._estimator_type == 'regressor':
1218
1253
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
1219
- self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
1220
-
1254
+ self._model_signature_dict["predict"] = ModelSignature(inputs,
1255
+ ([] if self._drop_input_cols else inputs) + outputs)
1221
1256
  for prob_func in PROB_FUNCTIONS:
1222
1257
  if hasattr(self, prob_func):
1223
1258
  output_cols_prefix: str = f"{prob_func}_"
1224
1259
  output_column_names = self._get_output_column_names(output_cols_prefix)
1225
1260
  outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
1226
- self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
1261
+ self._model_signature_dict[prob_func] = ModelSignature(inputs,
1262
+ ([] if self._drop_input_cols else inputs) + outputs)
1227
1263
 
1228
1264
  @property
1229
1265
  def model_signatures(self) -> Dict[str, ModelSignature]: